Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1414,6 +1414,34 @@
     default:
       break;
 
+    case Intrinsic::powi:
+      // Only a constant exponent is costed here; otherwise control leaves the
+      // switch through the break below.
+      if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
+        bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
+        if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
+                                               ShouldOptForSize)) {
+          // The cost is modeled on the expansion performed by ExpandPowI in
+          // SelectionDAGBuilder, which works on the magnitude of the exponent.
+          // Use abs() so the sign bits of a negative exponent are not counted
+          // as multiplies.
+          APInt Exponent = RHSC->getValue().abs();
+          unsigned ActiveBits = Exponent.getActiveBits();
+          unsigned PopCount = Exponent.countPopulation();
+          // A zero exponent has no active bits; keep the unsigned subtraction
+          // from wrapping around in that case.
+          unsigned NumMuls = ActiveBits ? ActiveBits + PopCount - 2 : 0;
+          InstructionCost Cost =
+              NumMuls * thisT()->getArithmeticInstrCost(Instruction::FMul,
+                                                        RetTy, CostKind);
+          // A negative exponent additionally costs one FDiv.
+          if (RHSC->getSExtValue() < 0)
+            Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
+                                                    CostKind);
+          return Cost;
+        }
+      }
+      break;
     case Intrinsic::cttz:
       // FIXME: If necessary, this should go in target-specific overrides.
       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2196,6 +2196,27 @@
     return false;
   }
 
+  /// Return true if it is beneficial to expand an @llvm.powi.* intrinsic
+  /// whose constant exponent is \p Exponent.
+  /// If not optimizing for size, expanding @llvm.powi.* intrinsics is always
+  /// considered beneficial.
+  /// If optimizing for size, expansion is only considered beneficial for up
+  /// to 5 multiplies and a divide (if the exponent is negative). A zero
+  /// exponent is reported as not beneficial in that mode.
+  bool isBeneficialToExpandPowI(int Exponent, bool OptForSize) const {
+    if (!OptForSize)
+      return true;
+    // Take the magnitude in unsigned arithmetic: negating INT_MIN as a signed
+    // int overflows, which is undefined behavior.
+    unsigned AbsExponent = static_cast<unsigned>(Exponent);
+    if (Exponent < 0)
+      AbsExponent = 0u - AbsExponent;
+    // Zero has no base-2 logarithm, so do not feed it to Log2_32.
+    if (AbsExponent == 0)
+      return false;
+    return countPopulation(AbsExponent) + Log2_32(AbsExponent) < 7;
+  }
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5346,38 +5346,36 @@
 /// ExpandPowI - Expand a llvm.powi intrinsic.
 static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
                           SelectionDAG &DAG) {
-  // If RHS is a constant, we can expand this out to a multiplication tree,
-  // otherwise we end up lowering to a call to __powidf2 (for example).  When
-  // optimizing for size, we only want to do this if the expansion would produce
-  // a small number of multiplies, otherwise we do the full expansion.
+  // If RHS is a constant, we can expand this out to a multiplication tree if
+  // it's beneficial on the target, otherwise we end up lowering to a call to
+  // __powidf2 (for example).
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
-    // Get the exponent as a positive value.
     unsigned Val = RHSC->getSExtValue();
-    if ((int)Val < 0) Val = -Val;
 
     // powi(x, 0) -> 1.0
     if (Val == 0)
       return DAG.getConstantFP(1.0, DL, LHS.getValueType());
 
-    bool OptForSize = DAG.shouldOptForSize();
-    if (!OptForSize ||
-        // If optimizing for size, don't insert too many multiplies.
-        // This inserts up to 5 multiplies.
-        countPopulation(Val) + Log2_32(Val) < 7) {
+    if (DAG.getTargetLoweringInfo().isBeneficialToExpandPowI(
+            Val, DAG.shouldOptForSize())) {
+      // Get the exponent as a positive value.
+      if ((int)Val < 0)
+        Val = -Val;
       // We use the simple binary decomposition method to generate the multiply
       // sequence.  There are more optimal ways to do this (for example,
       // powi(x,15) generates one more multiply than it should), but this has
       // the benefit of being both really simple and much better than a libcall.
-      SDValue Res;  // Logically starts equal to 1.0
+      SDValue Res; // Logically starts equal to 1.0
       SDValue CurSquare = LHS;
       // TODO: Intrinsics should have fast-math-flags that propagate to these
       // nodes.
       while (Val) {
         if (Val & 1) {
           if (Res.getNode())
-            Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare);
+            Res =
+                DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res, CurSquare);
           else
-            Res = CurSquare;  // 1.0*CurSquare.
+            Res = CurSquare; // 1.0*CurSquare.
         }
 
         CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
Index: llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
===================================================================
--- llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -225,12 +225,12 @@
 declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
 declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
 
-define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
+define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
 ; CHECK-LABEL: 'unsupported_fp_ops'
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
@@ -242,7 +242,7 @@
   %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
   %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
   %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
-  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
   %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
   %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
   %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
@@ -251,6 +251,15 @@
   ret void
 }
 
+define void @powi(<vscale x 4 x float> %vec) {
+; CHECK-LABEL: 'powi'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+  ret void
+}
+
 declare <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float>)
 declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
 declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
Index: llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
===================================================================
--- llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -passes='print<cost-model>' 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v | FileCheck %s
 
-define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
+define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
 ; CHECK-LABEL: 'unsupported_fp_ops'
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
@@ -20,7 +20,7 @@
   %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
   %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
   %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
-  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
   %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
   %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
   %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
@@ -31,6 +31,15 @@
   ret void
 }
 
+define void @powi(<vscale x 4 x float> %vec) {
+; CHECK-LABEL: 'powi'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+  ret void
+}
+
 define void @fshr(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c) {
 ; CHECK-LABEL: 'fshr'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %1 = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
Index: llvm/test/Analysis/CostModel/X86/powi.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/powi.ll
+++ llvm/test/Analysis/CostModel/X86/powi.ll
@@ -74,55 +74,55 @@
 
 define i32 @powi_3() {
 ; SSE-LABEL: 'powi_3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_3'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_3'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_3'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 3)
@@ -142,55 +142,55 @@
 
 define i32 @powi_n3() {
 ; SSE-LABEL: 'powi_n3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 644 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 382 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 764 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1528 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_n3'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 300 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 332 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 664 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_n3'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 356 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_n3'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 154 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 -3)
@@ -210,25 +210,25 @@
 
 define i32 @powi_6() {
 ; SSE-LABEL: 'powi_6'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_6'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
@@ -236,29 +236,29 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_6'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_6'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 6)
@@ -278,55 +278,55 @@
 
 define i32 @powi_16() {
 ; SSE-LABEL: 'powi_16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 16)
Index: llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
@@ -6,13 +6,8 @@
 define <2 x double> @PR53887_v2f64(<2 x double> noundef %x) {
 ; CHECK-LABEL: @PR53887_v2f64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x double> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT]], i32 6)
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x double> undef, double [[TMP0]], i64 0
-; CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <2 x double> [[X]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT1]], i32 6)
-; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <2 x double> [[VECINIT]], double [[TMP1]], i64 1
-; CHECK-NEXT:    ret <2 x double> [[VECINIT3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[X:%.*]], i32 6)
+; CHECK-NEXT:    ret <2 x double> [[TMP0]]
 ;
 entry:
   %vecext = extractelement <2 x double> %x, i64 0
@@ -27,20 +22,8 @@
 define <4 x double> @PR53887_v4f64(<4 x double> noundef %x) {
 ; CHECK-LABEL: @PR53887_v4f64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x double> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x double> [[X]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[VECEXT]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[VECEXT1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP1]], i32 6)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x double> [[X]], i64 2
-; CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x double> [[X]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[VECEXT4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[VECEXT7]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 6)
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[VECINIT91:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[VECINIT91]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[X:%.*]], i32 6)
+; CHECK-NEXT:    ret <4 x double> [[TMP0]]
 ;
 entry:
   %vecext = extractelement <4 x double> %x, i64 0
Index: llvm/test/Transforms/SLPVectorizer/X86/powi.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/powi.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/powi.ll
@@ -1,18 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s
 
 define <2 x double> @buildvector_powi_2f64_6(<2 x double> %a) {
 ; CHECK-LABEL: @buildvector_powi_2f64_6(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
-; CHECK-NEXT:    [[C0:%.*]] = call double @llvm.powi.f64.i32(double [[A0]], i32 6)
-; CHECK-NEXT:    [[C1:%.*]] = call double @llvm.powi.f64.i32(double [[A1]], i32 6)
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[C1]], i32 1
-; CHECK-NEXT:    ret <2 x double> [[R1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[A:%.*]], i32 6)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -43,69 +38,9 @@
 }
 
 define <4 x float> @buildvector_powi_4f32_3(<4 x float> %a) {
-; SSE-LABEL: @buildvector_powi_4f32_3(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP5]], i32 3)
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:    ret <4 x float> [[R31]]
-;
-; AVX1-LABEL: @buildvector_powi_4f32_3(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
-; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
-; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX1-NEXT:    ret <4 x float> [[R31]]
-;
-; AVX2-LABEL: @buildvector_powi_4f32_3(
-; AVX2-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX2-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX2-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; AVX2-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; AVX2-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
-; AVX2-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX2-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX2-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX2-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX2-NEXT:    ret <4 x float> [[R31]]
-;
-; AVX512-LABEL: @buildvector_powi_4f32_3(
-; AVX512-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX512-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX512-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; AVX512-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; AVX512-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
-; AVX512-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX512-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
-; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX512-NEXT:    ret <4 x float> [[R31]]
+; CHECK-LABEL: @buildvector_powi_4f32_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[A:%.*]], i32 3)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -127,45 +62,9 @@
 ;
 
 define <4 x double> @buildvector_powi_4f64_16(<4 x double> %a) {
-; SSE-LABEL: @buildvector_powi_4f64_16(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 16)
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 16)
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:    ret <4 x double> [[R31]]
-;
-; AVX1-LABEL: @buildvector_powi_4f64_16(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 16)
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 16)
-; AVX1-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX1-NEXT:    ret <4 x double> [[R31]]
-;
-; AVX2-LABEL: @buildvector_powi_4f64_16(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
-; AVX2-NEXT:    ret <4 x double> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_4f64_16(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
-; AVX512-NEXT:    ret <4 x double> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_4f64_16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -183,66 +82,9 @@
 }
 
 define <8 x float> @buildvector_powi_8f32_4(<8 x float> %a) {
-; SSE-LABEL: @buildvector_powi_8f32_4(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 4)
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[A4]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[A5]], i32 1
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[A6]], i32 2
-; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[A7]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP9]], i32 4)
-; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x float> [[R71]]
-;
-; AVX1-LABEL: @buildvector_powi_8f32_4(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 4)
-; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 4)
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 4)
-; AVX1-NEXT:    [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 4)
-; AVX1-NEXT:    [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 4)
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A6]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A7]], i32 1
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP5]], i32 4)
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[C0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[C1]], i32 1
-; AVX1-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R32:%.*]] = shufflevector <8 x float> [[R1]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R32]], float [[C4]], i32 4
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[C5]], i32 5
-; AVX1-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX1-NEXT:    ret <8 x float> [[R71]]
-;
-; AVX2-LABEL: @buildvector_powi_8f32_4(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
-; AVX2-NEXT:    ret <8 x float> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_8f32_4(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
-; AVX512-NEXT:    ret <8 x float> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_8f32_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -276,61 +118,9 @@
 ;
 
 define <8 x double> @buildvector_powi_8f64_5(<8 x double> %a) {
-; SSE-LABEL: @buildvector_powi_8f64_5(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 5)
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
-; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 5)
-; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x double> [[R71]]
-;
-; AVX1-LABEL: @buildvector_powi_8f64_5(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
-; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
-; AVX1-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 5)
-; AVX1-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
-; AVX1-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
-; AVX1-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
-; AVX1-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
-; AVX1-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 5)
-; AVX1-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:    ret <8 x double> [[R71]]
-;
-; AVX2-LABEL: @buildvector_powi_8f64_5(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
-; AVX2-NEXT:    ret <8 x double> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_8f64_5(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
-; AVX512-NEXT:    ret <8 x double> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_8f64_5(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -415,108 +205,9 @@
 }
 
 define <16 x float> @buildvector_powi_16f32_n13(<16 x float> %a) {
-; SSE-LABEL: @buildvector_powi_16f32_n13(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
-; SSE-NEXT:    [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
-; SSE-NEXT:    [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
-; SSE-NEXT:    [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
-; SSE-NEXT:    [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
-; SSE-NEXT:    [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
-; SSE-NEXT:    [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
-; SSE-NEXT:    [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
-; SSE-NEXT:    [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> poison, float [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[A2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[A3]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[A4]], i32 4
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[A5]], i32 5
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[A6]], i32 6
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[A7]], i32 7
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP8]], i32 -13)
-; SSE-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> poison, float [[A8]], i32 0
-; SSE-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[A9]], i32 1
-; SSE-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[A10]], i32 2
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[A11]], i32 3
-; SSE-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[A12]], i32 4
-; SSE-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[A13]], i32 5
-; SSE-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[A14]], i32 6
-; SSE-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[A15]], i32 7
-; SSE-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP17]], i32 -13)
-; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> [[TMP18]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R151:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE-NEXT:    ret <16 x float> [[R151]]
-;
-; AVX1-LABEL: @buildvector_powi_16f32_n13(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
-; AVX1-NEXT:    [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
-; AVX1-NEXT:    [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
-; AVX1-NEXT:    [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
-; AVX1-NEXT:    [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
-; AVX1-NEXT:    [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
-; AVX1-NEXT:    [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
-; AVX1-NEXT:    [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
-; AVX1-NEXT:    [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
-; AVX1-NEXT:    [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
-; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 -13)
-; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 -13)
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 -13)
-; AVX1-NEXT:    [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 -13)
-; AVX1-NEXT:    [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 -13)
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A6]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A7]], i32 1
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP5]], i32 -13)
-; AVX1-NEXT:    [[C8:%.*]] = call float @llvm.powi.f32.i32(float [[A8]], i32 -13)
-; AVX1-NEXT:    [[C9:%.*]] = call float @llvm.powi.f32.i32(float [[A9]], i32 -13)
-; AVX1-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A10]], i32 0
-; AVX1-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A11]], i32 1
-; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP8]], i32 -13)
-; AVX1-NEXT:    [[C12:%.*]] = call float @llvm.powi.f32.i32(float [[A12]], i32 -13)
-; AVX1-NEXT:    [[C13:%.*]] = call float @llvm.powi.f32.i32(float [[A13]], i32 -13)
-; AVX1-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> poison, float [[A14]], i32 0
-; AVX1-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> [[TMP10]], float [[A15]], i32 1
-; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP11]], i32 -13)
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <16 x float> poison, float [[C0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[C1]], i32 1
-; AVX1-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R34:%.*]] = shufflevector <16 x float> [[R1]], <16 x float> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R34]], float [[C4]], i32 4
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[C5]], i32 5
-; AVX1-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R73:%.*]] = shufflevector <16 x float> [[R5]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R73]], float [[C8]], i32 8
-; AVX1-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[C9]], i32 9
-; AVX1-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R112:%.*]] = shufflevector <16 x float> [[R9]], <16 x float> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R112]], float [[C12]], i32 12
-; AVX1-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[C13]], i32 13
-; AVX1-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R151:%.*]] = shufflevector <16 x float> [[R13]], <16 x float> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:    ret <16 x float> [[R151]]
-;
-; AVX2-LABEL: @buildvector_powi_16f32_n13(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
-; AVX2-NEXT:    ret <16 x float> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_16f32_n13(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
-; AVX512-NEXT:    ret <16 x float> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_16f32_n13(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1