Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -47,6 +47,7 @@
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -1411,6 +1412,24 @@
     default:
       break;
 
+    case Intrinsic::powi:
+      if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
+        bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
+        if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
+                                               ShouldOptForSize)) {
+          // The cost is modeled on the expansion performed by ExpandPowI in
+          // SelectionDAGBuilder.
+          unsigned ActiveBits = RHSC->getValue().getActiveBits();
+          InstructionCost Cost =
+              ActiveBits * thisT()->getArithmeticInstrCost(Instruction::FMul,
+                                                           RetTy, CostKind);
+          if (RHSC->getSExtValue() < 0)
+            Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
+                                                    CostKind);
+          return Cost;
+        }
+      }
+      break;
     case Intrinsic::cttz:
       // FIXME: If necessary, this should go in target-specific overrides.
       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2196,6 +2196,18 @@
     return false;
   }
 
+  /// Return true if it is beneficial to expand an @llvm.powi.* intrinsic.
+  /// If not optimizing for size, expanding @llvm.powi.* intrinsics is always
+  /// considered beneficial.
+  /// If optimizing for size, expansion is only considered beneficial for upto
+  /// 5 multiplies and a divide (if the exponent is negative).
+  virtual bool isBeneficialToExpandPowI(unsigned int Exponent, bool OptForSize) const {
+    if ((int)Exponent < 0)
+      Exponent = -Exponent;
+    return !OptForSize ||
+           (countPopulation(Exponent) + Log2_32(Exponent) < 7);
+  }
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5346,38 +5346,36 @@
 /// ExpandPowI - Expand a llvm.powi intrinsic.
 static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
                           SelectionDAG &DAG) {
-  // If RHS is a constant, we can expand this out to a multiplication tree,
-  // otherwise we end up lowering to a call to __powidf2 (for example).  When
-  // optimizing for size, we only want to do this if the expansion would produce
-  // a small number of multiplies, otherwise we do the full expansion.
+  // If RHS is a constant, we can expand this out to a multiplication tree if
+  // it's beneficial on the target, otherwise we end up lowering to a call to
+  // __powidf2 (for example).
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
-    // Get the exponent as a positive value.
     unsigned Val = RHSC->getSExtValue();
-    if ((int)Val < 0) Val = -Val;
 
     // powi(x, 0) -> 1.0
     if (Val == 0)
       return DAG.getConstantFP(1.0, DL, LHS.getValueType());
 
-    bool OptForSize = DAG.shouldOptForSize();
-    if (!OptForSize ||
-        // If optimizing for size, don't insert too many multiplies.
-        // This inserts up to 5 multiplies.
-        countPopulation(Val) + Log2_32(Val) < 7) {
+    if (DAG.getTargetLoweringInfo().isBeneficialToExpandPowI(
+            Val, DAG.shouldOptForSize())) {
+      // Get the exponent as a positive value.
+      if ((int)Val < 0)
+        Val = -Val;
       // We use the simple binary decomposition method to generate the multiply
       // sequence.  There are more optimal ways to do this (for example,
       // powi(x,15) generates one more multiply than it should), but this has
       // the benefit of being both really simple and much better than a libcall.
-      SDValue Res;  // Logically starts equal to 1.0
+      SDValue Res; // Logically starts equal to 1.0
       SDValue CurSquare = LHS;
       // TODO: Intrinsics should have fast-math-flags that propagate to these
       // nodes.
       while (Val) {
         if (Val & 1) {
           if (Res.getNode())
-            Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare);
+            Res =
+                DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res, CurSquare);
           else
-            Res = CurSquare;  // 1.0*CurSquare.
+            Res = CurSquare; // 1.0*CurSquare.
         }
 
         CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
Index: llvm/test/Analysis/CostModel/X86/powi.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/powi.ll
+++ llvm/test/Analysis/CostModel/X86/powi.ll
@@ -74,55 +74,55 @@
 
 define i32 @powi_3() {
 ; SSE-LABEL: 'powi_3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_3'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_3'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_3'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 3)
@@ -142,55 +142,55 @@
 
 define i32 @powi_n3() {
 ; SSE-LABEL: 'powi_n3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 206 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 412 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 532 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1064 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_n3'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_n3'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_n3'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 -3)
@@ -210,25 +210,25 @@
 
 define i32 @powi_6() {
 ; SSE-LABEL: 'powi_6'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_6'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
@@ -236,29 +236,29 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_6'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_6'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 6)
@@ -278,55 +278,55 @@
 
 define i32 @powi_16() {
 ; SSE-LABEL: 'powi_16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX1-LABEL: 'powi_16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX2-LABEL: 'powi_16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
 ; AVX512-LABEL: 'powi_16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
 ;
   %F32 = call float @llvm.powi.f32(float poison, i32 16)
Index: llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s
+
+define <2 x double> @PR53887_v2f64(<2 x double> noundef %x) {
+; CHECK-LABEL: @PR53887_v2f64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[X:%.*]], i32 6)
+; CHECK-NEXT:    ret <2 x double> [[TMP0]]
+;
+entry:
+  %vecext = extractelement <2 x double> %x, i64 0
+  %0 = tail call fast double @llvm.powi.f64.i32(double %vecext, i32 6)
+  %vecinit = insertelement <2 x double> undef, double %0, i64 0
+  %vecext1 = extractelement <2 x double> %x, i64 1
+  %1 = tail call fast double @llvm.powi.f64.i32(double %vecext1, i32 6)
+  %vecinit3 = insertelement <2 x double> %vecinit, double %1, i64 1
+  ret <2 x double> %vecinit3
+}
+
+define <4 x double> @PR53887_v4f64(<4 x double> noundef %x) local_unnamed_addr #0 {
+; CHECK-LABEL: @PR53887_v4f64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[X:%.*]], i32 6)
+; CHECK-NEXT:    ret <4 x double> [[TMP0]]
+;
+entry:
+  %vecext = extractelement <4 x double> %x, i64 0
+  %0 = tail call fast double @llvm.powi.f64.i32(double %vecext, i32 6) #2
+  %vecinit = insertelement <4 x double> undef, double %0, i64 0
+  %vecext1 = extractelement <4 x double> %x, i64 1
+  %1 = tail call fast double @llvm.powi.f64.i32(double %vecext1, i32 6) #2
+  %vecinit3 = insertelement <4 x double> %vecinit, double %1, i64 1
+  %vecext4 = extractelement <4 x double> %x, i64 2
+  %2 = tail call fast double @llvm.powi.f64.i32(double %vecext4, i32 6) #2
+  %vecinit6 = insertelement <4 x double> %vecinit3, double %2, i64 2
+  %vecext7 = extractelement <4 x double> %x, i64 3
+  %3 = tail call fast double @llvm.powi.f64.i32(double %vecext7, i32 6) #2
+  %vecinit9 = insertelement <4 x double> %vecinit6, double %3, i64 3
+  ret <4 x double> %vecinit9
+}
+
+declare double @llvm.powi.f64.i32(double, i32)
Index: llvm/test/Transforms/SLPVectorizer/X86/powi.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/powi.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/powi.ll
@@ -1,18 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s
 
 define <2 x double> @buildvector_powi_2f64_6(<2 x double> %a) {
 ; CHECK-LABEL: @buildvector_powi_2f64_6(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
-; CHECK-NEXT:    [[C0:%.*]] = call double @llvm.powi.f64.i32(double [[A0]], i32 6)
-; CHECK-NEXT:    [[C1:%.*]] = call double @llvm.powi.f64.i32(double [[A1]], i32 6)
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[C1]], i32 1
-; CHECK-NEXT:    ret <2 x double> [[R1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[A:%.*]], i32 6)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -43,67 +38,9 @@
 }
 
 define <4 x float> @buildvector_powi_4f32_3(<4 x float> %a) {
-; SSE-LABEL: @buildvector_powi_4f32_3(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP5]], i32 3)
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:    ret <4 x float> [[R31]]
-;
-; AVX1-LABEL: @buildvector_powi_4f32_3(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
-; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
-; AVX1-NEXT:    [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 3)
-; AVX1-NEXT:    [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 3)
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[C2]], i32 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[C3]], i32 3
-; AVX1-NEXT:    ret <4 x float> [[R3]]
-;
-; AVX2-LABEL: @buildvector_powi_4f32_3(
-; AVX2-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX2-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX2-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; AVX2-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; AVX2-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
-; AVX2-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
-; AVX2-NEXT:    [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 3)
-; AVX2-NEXT:    [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 3)
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
-; AVX2-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[C2]], i32 2
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[C3]], i32 3
-; AVX2-NEXT:    ret <4 x float> [[R3]]
-;
-; AVX512-LABEL: @buildvector_powi_4f32_3(
-; AVX512-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX512-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX512-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; AVX512-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; AVX512-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
-; AVX512-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
-; AVX512-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
-; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX512-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX512-NEXT:    ret <4 x float> [[R31]]
+; CHECK-LABEL: @buildvector_powi_4f32_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[A:%.*]], i32 3)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -125,45 +62,9 @@
 ;
 
 define <4 x double> @buildvector_powi_4f64_16(<4 x double> %a) {
-; SSE-LABEL: @buildvector_powi_4f64_16(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 16)
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 16)
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:    ret <4 x double> [[R31]]
-;
-; AVX1-LABEL: @buildvector_powi_4f64_16(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 16)
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 16)
-; AVX1-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX1-NEXT:    ret <4 x double> [[R31]]
-;
-; AVX2-LABEL: @buildvector_powi_4f64_16(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
-; AVX2-NEXT:    ret <4 x double> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_4f64_16(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
-; AVX512-NEXT:    ret <4 x double> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_4f64_16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -181,64 +82,9 @@
 }
 
 define <8 x float> @buildvector_powi_8f32_4(<8 x float> %a) {
-; SSE-LABEL: @buildvector_powi_8f32_4(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 4)
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[A4]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[A5]], i32 1
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[A6]], i32 2
-; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[A7]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP9]], i32 4)
-; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x float> [[R71]]
-;
-; AVX1-LABEL: @buildvector_powi_8f32_4(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 4)
-; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 4)
-; AVX1-NEXT:    [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 4)
-; AVX1-NEXT:    [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 4)
-; AVX1-NEXT:    [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 4)
-; AVX1-NEXT:    [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 4)
-; AVX1-NEXT:    [[C6:%.*]] = call float @llvm.powi.f32.i32(float [[A6]], i32 4)
-; AVX1-NEXT:    [[C7:%.*]] = call float @llvm.powi.f32.i32(float [[A7]], i32 4)
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[C0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[C1]], i32 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[C2]], i32 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[C3]], i32 3
-; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[C4]], i32 4
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[C5]], i32 5
-; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[C6]], i32 6
-; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[C7]], i32 7
-; AVX1-NEXT:    ret <8 x float> [[R7]]
-;
-; AVX2-LABEL: @buildvector_powi_8f32_4(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
-; AVX2-NEXT:    ret <8 x float> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_8f32_4(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
-; AVX512-NEXT:    ret <8 x float> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_8f32_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -272,61 +118,9 @@
 ;
 
 define <8 x double> @buildvector_powi_8f64_5(<8 x double> %a) {
-; SSE-LABEL: @buildvector_powi_8f64_5(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 5)
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
-; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 5)
-; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x double> [[R71]]
-;
-; AVX1-LABEL: @buildvector_powi_8f64_5(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
-; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
-; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
-; AVX1-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
-; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
-; AVX1-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 5)
-; AVX1-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
-; AVX1-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
-; AVX1-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
-; AVX1-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
-; AVX1-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 5)
-; AVX1-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:    ret <8 x double> [[R71]]
-;
-; AVX2-LABEL: @buildvector_powi_8f64_5(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
-; AVX2-NEXT:    ret <8 x double> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_8f64_5(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
-; AVX512-NEXT:    ret <8 x double> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_8f64_5(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -411,104 +205,9 @@
 }
 
 define <16 x float> @buildvector_powi_16f32_n13(<16 x float> %a) {
-; SSE-LABEL: @buildvector_powi_16f32_n13(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
-; SSE-NEXT:    [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
-; SSE-NEXT:    [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
-; SSE-NEXT:    [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
-; SSE-NEXT:    [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
-; SSE-NEXT:    [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
-; SSE-NEXT:    [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
-; SSE-NEXT:    [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
-; SSE-NEXT:    [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> poison, float [[A0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[A1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[A2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[A3]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[A4]], i32 4
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[A5]], i32 5
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[A6]], i32 6
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[A7]], i32 7
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP8]], i32 -13)
-; SSE-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> poison, float [[A8]], i32 0
-; SSE-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[A9]], i32 1
-; SSE-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[A10]], i32 2
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[A11]], i32 3
-; SSE-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[A12]], i32 4
-; SSE-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[A13]], i32 5
-; SSE-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[A14]], i32 6
-; SSE-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[A15]], i32 7
-; SSE-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP17]], i32 -13)
-; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> [[TMP18]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R151:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE-NEXT:    ret <16 x float> [[R151]]
-;
-; AVX1-LABEL: @buildvector_powi_16f32_n13(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
-; AVX1-NEXT:    [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
-; AVX1-NEXT:    [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
-; AVX1-NEXT:    [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
-; AVX1-NEXT:    [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
-; AVX1-NEXT:    [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
-; AVX1-NEXT:    [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
-; AVX1-NEXT:    [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
-; AVX1-NEXT:    [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
-; AVX1-NEXT:    [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
-; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 -13)
-; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 -13)
-; AVX1-NEXT:    [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 -13)
-; AVX1-NEXT:    [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 -13)
-; AVX1-NEXT:    [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 -13)
-; AVX1-NEXT:    [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 -13)
-; AVX1-NEXT:    [[C6:%.*]] = call float @llvm.powi.f32.i32(float [[A6]], i32 -13)
-; AVX1-NEXT:    [[C7:%.*]] = call float @llvm.powi.f32.i32(float [[A7]], i32 -13)
-; AVX1-NEXT:    [[C8:%.*]] = call float @llvm.powi.f32.i32(float [[A8]], i32 -13)
-; AVX1-NEXT:    [[C9:%.*]] = call float @llvm.powi.f32.i32(float [[A9]], i32 -13)
-; AVX1-NEXT:    [[C10:%.*]] = call float @llvm.powi.f32.i32(float [[A10]], i32 -13)
-; AVX1-NEXT:    [[C11:%.*]] = call float @llvm.powi.f32.i32(float [[A11]], i32 -13)
-; AVX1-NEXT:    [[C12:%.*]] = call float @llvm.powi.f32.i32(float [[A12]], i32 -13)
-; AVX1-NEXT:    [[C13:%.*]] = call float @llvm.powi.f32.i32(float [[A13]], i32 -13)
-; AVX1-NEXT:    [[C14:%.*]] = call float @llvm.powi.f32.i32(float [[A14]], i32 -13)
-; AVX1-NEXT:    [[C15:%.*]] = call float @llvm.powi.f32.i32(float [[A15]], i32 -13)
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <16 x float> poison, float [[C0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[C1]], i32 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[C2]], i32 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[C3]], i32 3
-; AVX1-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[C4]], i32 4
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[C5]], i32 5
-; AVX1-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[C6]], i32 6
-; AVX1-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[C7]], i32 7
-; AVX1-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[C8]], i32 8
-; AVX1-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[C9]], i32 9
-; AVX1-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[C10]], i32 10
-; AVX1-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[C11]], i32 11
-; AVX1-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[C12]], i32 12
-; AVX1-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[C13]], i32 13
-; AVX1-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[C14]], i32 14
-; AVX1-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[C15]], i32 15
-; AVX1-NEXT:    ret <16 x float> [[R15]]
-;
-; AVX2-LABEL: @buildvector_powi_16f32_n13(
-; AVX2-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
-; AVX2-NEXT:    ret <16 x float> [[TMP1]]
-;
-; AVX512-LABEL: @buildvector_powi_16f32_n13(
-; AVX512-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
-; AVX512-NEXT:    ret <16 x float> [[TMP1]]
+; CHECK-LABEL: @buildvector_powi_16f32_n13(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1