Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1414,6 +1414,26 @@
     default:
       break;
 
+    case Intrinsic::powi:
+      if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
+        bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
+        if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
+                                               ShouldOptForSize)) {
+          // The cost is modeled on the expansion performed by ExpandPowI in
+          // SelectionDAGBuilder.
+          APInt Exponent = RHSC->getValue().abs();
+          unsigned ActiveBits = Exponent.getActiveBits();
+          unsigned PopCount = Exponent.countPopulation();
+          InstructionCost Cost = (ActiveBits + PopCount - 2) *
+                                 thisT()->getArithmeticInstrCost(
+                                     Instruction::FMul, RetTy, CostKind);
+          if (RHSC->getSExtValue() < 0)
+            Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
+                                                    CostKind);
+          return Cost;
+        }
+      }
+      break;
     case Intrinsic::cttz:
       // FIXME: If necessary, this should go in target-specific overrides.
       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2196,6 +2196,18 @@
     return false;
   }
 
+  /// Return true if it is beneficial to expand an @llvm.powi.* intrinsic.
+  /// If not optimizing for size, expanding @llvm.powi.* intrinsics is always
+  /// considered beneficial.
+  /// If optimizing for size, expansion is only considered beneficial for upto
+  /// 5 multiplies and a divide (if the exponent is negative).
+  bool isBeneficialToExpandPowI(int Exponent, bool OptForSize) const {
+    if (Exponent < 0)
+      Exponent = -Exponent;
+    return !OptForSize ||
+           (countPopulation((unsigned int)Exponent) + Log2_32(Exponent) < 7);
+  }
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5346,38 +5346,36 @@
 /// ExpandPowI - Expand a llvm.powi intrinsic.
 static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
                           SelectionDAG &DAG) {
-  // If RHS is a constant, we can expand this out to a multiplication tree,
-  // otherwise we end up lowering to a call to __powidf2 (for example).  When
-  // optimizing for size, we only want to do this if the expansion would produce
-  // a small number of multiplies, otherwise we do the full expansion.
+  // If RHS is a constant, we can expand this out to a multiplication tree if
+  // it's beneficial on the target, otherwise we end up lowering to a call to
+  // __powidf2 (for example).
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
-    // Get the exponent as a positive value.
     unsigned Val = RHSC->getSExtValue();
-    if ((int)Val < 0) Val = -Val;
 
     // powi(x, 0) -> 1.0
     if (Val == 0)
       return DAG.getConstantFP(1.0, DL, LHS.getValueType());
 
-    bool OptForSize = DAG.shouldOptForSize();
-    if (!OptForSize ||
-        // If optimizing for size, don't insert too many multiplies.
-        // This inserts up to 5 multiplies.
-        countPopulation(Val) + Log2_32(Val) < 7) {
+    if (DAG.getTargetLoweringInfo().isBeneficialToExpandPowI(
+            Val, DAG.shouldOptForSize())) {
+      // Get the exponent as a positive value.
+      if ((int)Val < 0)
+        Val = -Val;
       // We use the simple binary decomposition method to generate the multiply
       // sequence.  There are more optimal ways to do this (for example,
       // powi(x,15) generates one more multiply than it should), but this has
       // the benefit of being both really simple and much better than a libcall.
-      SDValue Res;  // Logically starts equal to 1.0
+      SDValue Res; // Logically starts equal to 1.0
       SDValue CurSquare = LHS;
       // TODO: Intrinsics should have fast-math-flags that propagate to these
       // nodes.
       while (Val) {
         if (Val & 1) {
           if (Res.getNode())
-            Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare);
+            Res =
+                DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res, CurSquare);
           else
-            Res = CurSquare;  // 1.0*CurSquare.
+            Res = CurSquare; // 1.0*CurSquare.
         }
 
         CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
Index: llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
===================================================================
--- llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -225,12 +225,12 @@
 declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
 declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
 
-define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
+define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
 ; CHECK-LABEL: 'unsupported_fp_ops'
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
@@ -242,7 +242,7 @@
   %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
   %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
   %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
-  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
   %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
   %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
   %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
@@ -251,6 +251,15 @@
   ret void
 }
 
+define void @powi(<vscale x 4 x float> %vec) {
+; CHECK-LABEL: 'powi'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+  ret void
+}
+
 declare <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float>)
 declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
 declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
Index: llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
===================================================================
--- llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -passes='print<cost-model>' 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v | FileCheck %s
 
-define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
+define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
 ; CHECK-LABEL: 'unsupported_fp_ops'
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
@@ -20,7 +20,7 @@
   %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
   %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
   %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
-  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
   %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
   %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
   %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
@@ -31,6 +31,15 @@
   ret void
 }
 
+define void @powi(<vscale x 4 x float> %vec) {
+; CHECK-LABEL: 'powi'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+  ret void
+}
+
 define void @fshr(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c) {
 ; CHECK-LABEL: 'fshr'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %1 = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
Index: llvm/test/Analysis/CostModel/X86/powi.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/powi.ll
+++ llvm/test/Analysis/CostModel/X86/powi.ll
@@ -74,55 +74,55 @@
 
 define i32 @powi_3() {
 ; SSE-LABEL: 'powi_3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_3'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_3'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_3'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 3)
@@ -142,55 +142,55 @@
 
 define i32 @powi_n3() {
 ; SSE-LABEL: 'powi_n3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 292 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 584 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_n3'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_n3'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_n3'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 -3)
@@ -210,25 +210,25 @@
 
 define i32 @powi_6() {
 ; SSE-LABEL: 'powi_6'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_6'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
@@ -236,29 +236,29 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_6'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_6'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 6)
@@ -278,55 +278,55 @@
 
 define i32 @powi_16() {
 ; SSE-LABEL: 'powi_16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 16)
Index: llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
@@ -6,13 +6,8 @@
 define <2 x double> @PR53887_v2f64(<2 x double> noundef %x) {
 ; CHECK-LABEL: @PR53887_v2f64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x double> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT]], i32 6)
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x double> undef, double [[TMP0]], i64 0
-; CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <2 x double> [[X]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT1]], i32 6)
-; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <2 x double> [[VECINIT]], double [[TMP1]], i64 1
-; CHECK-NEXT:    ret <2 x double> [[VECINIT3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[X:%.*]], i32 6)
+; CHECK-NEXT:    ret <2 x double> [[TMP0]]
 ;
 entry:
   %vecext = extractelement <2 x double> %x, i64 0
@@ -27,20 +22,8 @@
 define <4 x double> @PR53887_v4f64(<4 x double> noundef %x) {
 ; CHECK-LABEL: @PR53887_v4f64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x double> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x double> [[X]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[VECEXT]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[VECEXT1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP1]], i32 6)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x double> [[X]], i64 2
-; CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x double> [[X]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[VECEXT4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[VECEXT7]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 6)
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[VECINIT91:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[VECINIT91]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[X:%.*]], i32 6)
+; CHECK-NEXT:    ret <4 x double> [[TMP0]]
 ;
 entry:
   %vecext = extractelement <4 x double> %x, i64 0
Index: llvm/test/Transforms/SLPVectorizer/X86/powi.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/powi.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/powi.ll
@@ -1,18 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s
 
 define <2 x double> @buildvector_powi_2f64_6(<2 x double> %a) {
 ; CHECK-LABEL: @buildvector_powi_2f64_6(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
-; CHECK-NEXT:    [[C0:%.*]] = call double @llvm.powi.f64.i32(double [[A0]], i32 6)
-; CHECK-NEXT:    [[C1:%.*]] = call double @llvm.powi.f64.i32(double [[A1]], i32 6)
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[C1]], i32 1
-; CHECK-NEXT:    ret <2 x double> [[R1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[A:%.*]], i32 6)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -43,69 +38,9 @@
 }
 
 define <4 x float> @buildvector_powi_4f32_3(<4 x float> %a) {
-; SSE-LABEL: @buildvector_powi_4f32_3(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP5]], i32 3)
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:    ret <4 x float> [[R31]]
-;
-; AVX1-LABEL: @buildvector_powi_4f32_3(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
-; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
-; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX1-NEXT:    ret <4 x float> [[R31]]
-;
-; AVX2-LABEL: @buildvector_powi_4f32_3(
-; AVX2-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX2-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX2-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; AVX2-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; AVX2-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
-; AVX2-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX2-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX2-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX2-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX2-NEXT:    ret <4 x float> [[R31]]
-;
-; AVX512-LABEL: @buildvector_powi_4f32_3(
-; AVX512-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX512-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX512-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; AVX512-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; AVX512-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
-; AVX512-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX512-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
-; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX512-NEXT:    ret <4 x float> [[R31]]
+; CHECK-LABEL: @buildvector_powi_4f32_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[A:%.*]], i32 3)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -127,45 +62,9 @@
 ;
 
 define <4 x double> @buildvector_powi_4f64_16(<4 x double> %a) {
-; SSE-LABEL: @buildvector_powi_4f64_16(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 16)
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 16)
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:    ret <4 x double> [[R31]]
-;
-; AVX1-LABEL: @buildvector_powi_4f64_16(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 16)
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 16)
-; AVX1-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX1-NEXT:    ret <4 x double> [[R31]]
-;
-; AVX2-LABEL: @buildvector_powi_4f64_16(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
-; AVX2-NEXT:    ret <4 x double> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_4f64_16(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
-; AVX512-NEXT:    ret <4 x double> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_4f64_16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -183,66 +82,9 @@
 }
 
 define <8 x float> @buildvector_powi_8f32_4(<8 x float> %a) {
-; SSE-LABEL: @buildvector_powi_8f32_4(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 4)
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[A4]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[A5]], i32 1
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[A6]], i32 2
-; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[A7]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP9]], i32 4)
-; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x float> [[R71]]
-;
-; AVX1-LABEL: @buildvector_powi_8f32_4(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 4)
-; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 4)
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 4)
-; AVX1-NEXT:    [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 4)
-; AVX1-NEXT:    [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 4)
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A6]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A7]], i32 1
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP5]], i32 4)
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[C0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[C1]], i32 1
-; AVX1-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R32:%.*]] = shufflevector <8 x float> [[R1]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R32]], float [[C4]], i32 4
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[C5]], i32 5
-; AVX1-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX1-NEXT:    ret <8 x float> [[R71]]
-;
-; AVX2-LABEL: @buildvector_powi_8f32_4(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
-; AVX2-NEXT:    ret <8 x float> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_8f32_4(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
-; AVX512-NEXT:    ret <8 x float> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_8f32_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -276,61 +118,9 @@
 ;
 
 define <8 x double> @buildvector_powi_8f64_5(<8 x double> %a) {
-; SSE-LABEL: @buildvector_powi_8f64_5(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 5)
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
-; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 5)
-; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x double> [[R71]]
-;
-; AVX1-LABEL: @buildvector_powi_8f64_5(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
-; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
-; AVX1-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 5)
-; AVX1-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
-; AVX1-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
-; AVX1-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
-; AVX1-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
-; AVX1-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 5)
-; AVX1-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:    ret <8 x double> [[R71]]
-;
-; AVX2-LABEL: @buildvector_powi_8f64_5(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
-; AVX2-NEXT:    ret <8 x double> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_8f64_5(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
-; AVX512-NEXT:    ret <8 x double> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_8f64_5(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -415,108 +205,9 @@
 }
 
 define <16 x float> @buildvector_powi_16f32_n13(<16 x float> %a) {
-; SSE-LABEL: @buildvector_powi_16f32_n13(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
-; SSE-NEXT:    [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
-; SSE-NEXT:    [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
-; SSE-NEXT:    [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
-; SSE-NEXT:    [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
-; SSE-NEXT:    [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
-; SSE-NEXT:    [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
-; SSE-NEXT:    [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
-; SSE-NEXT:    [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> poison, float [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[A2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[A3]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[A4]], i32 4
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[A5]], i32 5
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[A6]], i32 6
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[A7]], i32 7
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP8]], i32 -13)
-; SSE-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> poison, float [[A8]], i32 0
-; SSE-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[A9]], i32 1
-; SSE-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[A10]], i32 2
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[A11]], i32 3
-; SSE-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[A12]], i32 4
-; SSE-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[A13]], i32 5
-; SSE-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[A14]], i32 6
-; SSE-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[A15]], i32 7
-; SSE-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP17]], i32 -13)
-; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> [[TMP18]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R151:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE-NEXT:    ret <16 x float> [[R151]]
-;
-; AVX1-LABEL: @buildvector_powi_16f32_n13(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
-; AVX1-NEXT:    [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
-; AVX1-NEXT:    [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
-; AVX1-NEXT:    [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
-; AVX1-NEXT:    [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
-; AVX1-NEXT:    [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
-; AVX1-NEXT:    [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
-; AVX1-NEXT:    [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
-; AVX1-NEXT:    [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
-; AVX1-NEXT:    [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
-; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 -13)
-; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 -13)
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 -13)
-; AVX1-NEXT:    [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 -13)
-; AVX1-NEXT:    [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 -13)
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A6]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A7]], i32 1
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP5]], i32 -13)
-; AVX1-NEXT:    [[C8:%.*]] = call float @llvm.powi.f32.i32(float [[A8]], i32 -13)
-; AVX1-NEXT:    [[C9:%.*]] = call float @llvm.powi.f32.i32(float [[A9]], i32 -13)
-; AVX1-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A10]], i32 0
-; AVX1-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A11]], i32 1
-; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP8]], i32 -13)
-; AVX1-NEXT:    [[C12:%.*]] = call float @llvm.powi.f32.i32(float [[A12]], i32 -13)
-; AVX1-NEXT:    [[C13:%.*]] = call float @llvm.powi.f32.i32(float [[A13]], i32 -13)
-; AVX1-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> poison, float [[A14]], i32 0
-; AVX1-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> [[TMP10]], float [[A15]], i32 1
-; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP11]], i32 -13)
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <16 x float> poison, float [[C0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[C1]], i32 1
-; AVX1-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R34:%.*]] = shufflevector <16 x float> [[R1]], <16 x float> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R34]], float [[C4]], i32 4
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[C5]], i32 5
-; AVX1-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R73:%.*]] = shufflevector <16 x float> [[R5]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R73]], float [[C8]], i32 8
-; AVX1-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[C9]], i32 9
-; AVX1-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R112:%.*]] = shufflevector <16 x float> [[R9]], <16 x float> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R112]], float [[C12]], i32 12
-; AVX1-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[C13]], i32 13
-; AVX1-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R151:%.*]] = shufflevector <16 x float> [[R13]], <16 x float> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:    ret <16 x float> [[R151]]
-;
-; AVX2-LABEL: @buildvector_powi_16f32_n13(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
-; AVX2-NEXT:    ret <16 x float> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_16f32_n13(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
-; AVX512-NEXT:    ret <16 x float> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_16f32_n13(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1