Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10250,6 +10250,20 @@ case ISD::BITCAST: // Don't handle bitcast here. return; + case ISD::FP_EXTEND: { + if (N->getValueType(0) != MVT::v4f64) + return; + SDValue Input = N->getOperand(0); + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Input, + DAG.getIntPtrConstant(0, dl)); + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Input, + DAG.getIntPtrConstant(2, dl)); + SDValue HiExt = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Hi); + SDValue LoExt = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Lo); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, HiExt, LoExt); + Results.push_back(Concat); + return; + } } } @@ -14944,12 +14958,13 @@ } bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); + EVT ScalVT = VT.getScalarType(); - if (!VT.isSimple()) + // No FMAs for types that are not simple or are too wide.
+ if (!ScalVT.isSimple() || (!Subtarget.hasQPX() && VT.getSizeInBits() > 128)) return false; - switch (VT.getSimpleVT().SimpleTy) { + switch (ScalVT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -83,6 +83,9 @@ unsigned getCacheLineSize() const override; unsigned getPrefetchDistance() const override; unsigned getMaxInterleaveFactor(unsigned VF); + bool shouldMaximizeVectorBandwidth(bool OptSize) const { + return !OptSize; + } int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, Index: llvm/test/CodeGen/PowerPC/vec_fmuladd.ll =================================================================== --- llvm/test/CodeGen/PowerPC/vec_fmuladd.ll +++ llvm/test/CodeGen/PowerPC/vec_fmuladd.ll @@ -1,4 +1,5 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr6 -mattr=+altivec \ +; RUN: --enable-unsafe-fp-math < %s | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" target triple = "powerpc64-unknown-linux-gnu" Index: llvm/test/Transforms/LoopVectorize/PowerPC/max-vec-bandwidth.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/PowerPC/max-vec-bandwidth.ll @@ -0,0 +1,290 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -S | FileCheck %s + +target triple = "powerpc64le-unknown-linux-gnu" + +define dso_local float @test(float* noalias %thing1, float* noalias %thing2) #0 { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, 
label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP132:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP133:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP134:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP135:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP136:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP139:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP140:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP141:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP142:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI11:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP143:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> 
[[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION12:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION13:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION14:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION15:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION16:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION17:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION18:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION19:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION20:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION21:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION22:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 12 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 20 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 24 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 28 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 36 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 40 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 44 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = sext i32 [[TMP9]] to i64 +; 
CHECK-NEXT: [[TMP22:%.*]] = sext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[THING1:%.*]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP37]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, <4 x float>* [[TMP39]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 8 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast float* [[TMP40]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP41]], align 4 +; CHECK-NEXT: 
[[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 12 +; CHECK-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <4 x float>, <4 x float>* [[TMP43]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 16 +; CHECK-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <4 x float>, <4 x float>* [[TMP45]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 20 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <4 x float>, <4 x float>* [[TMP47]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 24 +; CHECK-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <4 x float>, <4 x float>* [[TMP49]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 28 +; CHECK-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <4 x float>, <4 x float>* [[TMP51]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 32 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP53]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 36 +; CHECK-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <4 x float>, <4 x float>* [[TMP55]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 40 +; CHECK-NEXT: [[TMP57:%.*]] = bitcast float* [[TMP56]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <4 x float>, <4 x float>* [[TMP57]], align 4 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds 
float, float* [[TMP24]], i32 44 +; CHECK-NEXT: [[TMP59:%.*]] = bitcast float* [[TMP58]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <4 x float>, <4 x float>* [[TMP59]], align 4 +; CHECK-NEXT: [[TMP60:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP61:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP62:%.*]] = sext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP63:%.*]] = sext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP64:%.*]] = sext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP66:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP67:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP68:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP69:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP70:%.*]] = sext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP71:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[THING2:%.*]], i64 [[TMP60]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP61]] +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP62]] +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP63]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP64]] +; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP65]] +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP66]] +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP67]] +; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP68]] +; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP69]] +; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP70]] +; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP71]] +; 
CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = bitcast float* [[TMP84]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, <4 x float>* [[TMP85]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 4 +; CHECK-NEXT: [[TMP87:%.*]] = bitcast float* [[TMP86]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD35:%.*]] = load <4 x float>, <4 x float>* [[TMP87]], align 4 +; CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 8 +; CHECK-NEXT: [[TMP89:%.*]] = bitcast float* [[TMP88]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD36:%.*]] = load <4 x float>, <4 x float>* [[TMP89]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 12 +; CHECK-NEXT: [[TMP91:%.*]] = bitcast float* [[TMP90]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD37:%.*]] = load <4 x float>, <4 x float>* [[TMP91]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 16 +; CHECK-NEXT: [[TMP93:%.*]] = bitcast float* [[TMP92]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD38:%.*]] = load <4 x float>, <4 x float>* [[TMP93]], align 4 +; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 20 +; CHECK-NEXT: [[TMP95:%.*]] = bitcast float* [[TMP94]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD39:%.*]] = load <4 x float>, <4 x float>* [[TMP95]], align 4 +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 24 +; CHECK-NEXT: [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD40:%.*]] = load <4 x float>, <4 x float>* [[TMP97]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 28 +; CHECK-NEXT: [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD41:%.*]] = load <4 x float>, <4 x float>* [[TMP99]], align 4 +; CHECK-NEXT: [[TMP100:%.*]] = getelementptr 
inbounds float, float* [[TMP72]], i32 32 +; CHECK-NEXT: [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD42:%.*]] = load <4 x float>, <4 x float>* [[TMP101]], align 4 +; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 36 +; CHECK-NEXT: [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD43:%.*]] = load <4 x float>, <4 x float>* [[TMP103]], align 4 +; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 40 +; CHECK-NEXT: [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD44:%.*]] = load <4 x float>, <4 x float>* [[TMP105]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 44 +; CHECK-NEXT: [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD45:%.*]] = load <4 x float>, <4 x float>* [[TMP107]], align 4 +; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD34]] +; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <4 x float> [[WIDE_LOAD23]], [[WIDE_LOAD35]] +; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <4 x float> [[WIDE_LOAD24]], [[WIDE_LOAD36]] +; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <4 x float> [[WIDE_LOAD25]], [[WIDE_LOAD37]] +; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <4 x float> [[WIDE_LOAD26]], [[WIDE_LOAD38]] +; CHECK-NEXT: [[TMP113:%.*]] = fmul fast <4 x float> [[WIDE_LOAD27]], [[WIDE_LOAD39]] +; CHECK-NEXT: [[TMP114:%.*]] = fmul fast <4 x float> [[WIDE_LOAD28]], [[WIDE_LOAD40]] +; CHECK-NEXT: [[TMP115:%.*]] = fmul fast <4 x float> [[WIDE_LOAD29]], [[WIDE_LOAD41]] +; CHECK-NEXT: [[TMP116:%.*]] = fmul fast <4 x float> [[WIDE_LOAD30]], [[WIDE_LOAD42]] +; CHECK-NEXT: [[TMP117:%.*]] = fmul fast <4 x float> [[WIDE_LOAD31]], [[WIDE_LOAD43]] +; CHECK-NEXT: [[TMP118:%.*]] = fmul fast <4 x float> [[WIDE_LOAD32]], [[WIDE_LOAD44]] +; CHECK-NEXT: [[TMP119:%.*]] = fmul fast <4 x float> [[WIDE_LOAD33]], [[WIDE_LOAD45]] +; 
CHECK-NEXT: [[TMP120:%.*]] = fpext <4 x float> [[TMP108]] to <4 x double> +; CHECK-NEXT: [[TMP121:%.*]] = fpext <4 x float> [[TMP109]] to <4 x double> +; CHECK-NEXT: [[TMP122:%.*]] = fpext <4 x float> [[TMP110]] to <4 x double> +; CHECK-NEXT: [[TMP123:%.*]] = fpext <4 x float> [[TMP111]] to <4 x double> +; CHECK-NEXT: [[TMP124:%.*]] = fpext <4 x float> [[TMP112]] to <4 x double> +; CHECK-NEXT: [[TMP125:%.*]] = fpext <4 x float> [[TMP113]] to <4 x double> +; CHECK-NEXT: [[TMP126:%.*]] = fpext <4 x float> [[TMP114]] to <4 x double> +; CHECK-NEXT: [[TMP127:%.*]] = fpext <4 x float> [[TMP115]] to <4 x double> +; CHECK-NEXT: [[TMP128:%.*]] = fpext <4 x float> [[TMP116]] to <4 x double> +; CHECK-NEXT: [[TMP129:%.*]] = fpext <4 x float> [[TMP117]] to <4 x double> +; CHECK-NEXT: [[TMP130:%.*]] = fpext <4 x float> [[TMP118]] to <4 x double> +; CHECK-NEXT: [[TMP131:%.*]] = fpext <4 x float> [[TMP119]] to <4 x double> +; CHECK-NEXT: [[TMP132]] = fadd fast <4 x double> [[VEC_PHI]], [[TMP120]] +; CHECK-NEXT: [[TMP133]] = fadd fast <4 x double> [[VEC_PHI1]], [[TMP121]] +; CHECK-NEXT: [[TMP134]] = fadd fast <4 x double> [[VEC_PHI2]], [[TMP122]] +; CHECK-NEXT: [[TMP135]] = fadd fast <4 x double> [[VEC_PHI3]], [[TMP123]] +; CHECK-NEXT: [[TMP136]] = fadd fast <4 x double> [[VEC_PHI4]], [[TMP124]] +; CHECK-NEXT: [[TMP137]] = fadd fast <4 x double> [[VEC_PHI5]], [[TMP125]] +; CHECK-NEXT: [[TMP138]] = fadd fast <4 x double> [[VEC_PHI6]], [[TMP126]] +; CHECK-NEXT: [[TMP139]] = fadd fast <4 x double> [[VEC_PHI7]], [[TMP127]] +; CHECK-NEXT: [[TMP140]] = fadd fast <4 x double> [[VEC_PHI8]], [[TMP128]] +; CHECK-NEXT: [[TMP141]] = fadd fast <4 x double> [[VEC_PHI9]], [[TMP129]] +; CHECK-NEXT: [[TMP142]] = fadd fast <4 x double> [[VEC_PHI10]], [[TMP130]] +; CHECK-NEXT: [[TMP143]] = fadd fast <4 x double> [[VEC_PHI11]], [[TMP131]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 48 +; CHECK-NEXT: [[TMP144:%.*]] = icmp eq i32 [[INDEX_NEXT]], 288 +; CHECK-NEXT: br i1 [[TMP144]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP133]], [[TMP132]] +; CHECK-NEXT: [[BIN_RDX46:%.*]] = fadd fast <4 x double> [[TMP134]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX47:%.*]] = fadd fast <4 x double> [[TMP135]], [[BIN_RDX46]] +; CHECK-NEXT: [[BIN_RDX48:%.*]] = fadd fast <4 x double> [[TMP136]], [[BIN_RDX47]] +; CHECK-NEXT: [[BIN_RDX49:%.*]] = fadd fast <4 x double> [[TMP137]], [[BIN_RDX48]] +; CHECK-NEXT: [[BIN_RDX50:%.*]] = fadd fast <4 x double> [[TMP138]], [[BIN_RDX49]] +; CHECK-NEXT: [[BIN_RDX51:%.*]] = fadd fast <4 x double> [[TMP139]], [[BIN_RDX50]] +; CHECK-NEXT: [[BIN_RDX52:%.*]] = fadd fast <4 x double> [[TMP140]], [[BIN_RDX51]] +; CHECK-NEXT: [[BIN_RDX53:%.*]] = fadd fast <4 x double> [[TMP141]], [[BIN_RDX52]] +; CHECK-NEXT: [[BIN_RDX54:%.*]] = fadd fast <4 x double> [[TMP142]], [[BIN_RDX53]] +; CHECK-NEXT: [[BIN_RDX55:%.*]] = fadd fast <4 x double> [[TMP143]], [[BIN_RDX54]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[BIN_RDX55]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX56:%.*]] = fadd fast <4 x double> [[BIN_RDX55]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF57:%.*]] = shufflevector <4 x double> [[BIN_RDX56]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX58:%.*]] = fadd fast <4 x double> [[BIN_RDX56]], [[RDX_SHUF57]] +; CHECK-NEXT: [[TMP145:%.*]] = extractelement <4 x double> [[BIN_RDX58]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 300, 288 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 288, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP145]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[AGGR_PROD_02:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], 
[[FOR_INC:%.*]] ] +; CHECK-NEXT: [[I_01:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I_01]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP146:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[I_01]] to i64 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[TMP147:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP146]], [[TMP147]] +; CHECK-NEXT: [[CONV:%.*]] = fpext float [[MUL]] to double +; CHECK-NEXT: [[ADD]] = fadd fast double [[AGGR_PROD_02]], [[CONV]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_01]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 300 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !2 +; CHECK: for.end: +; CHECK-NEXT: [[AGGR_PROD_0_LCSSA:%.*]] = phi double [ [[ADD]], [[FOR_INC]] ], [ [[TMP145]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[CONV3:%.*]] = fptrunc double [[AGGR_PROD_0_LCSSA]] to float +; CHECK-NEXT: ret float [[CONV3]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc + %aggr_prod.02 = phi double [ 0.000000e+00, %entry ], [ %add, %for.inc ] + %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %idxprom = sext i32 %i.01 to i64 + %arrayidx = getelementptr inbounds float, float* %thing1, i64 %idxprom + %0 = load float, float* %arrayidx, align 4 + %idxprom1 = sext i32 %i.01 to i64 + %arrayidx2 = getelementptr inbounds float, float* %thing2, i64 %idxprom1 + %1 = load float, float* %arrayidx2, align 4 + %mul = fmul fast float %0, %1 + %conv = fpext float %mul to double + %add = fadd fast double %aggr_prod.02, %conv + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.01, 1 + %cmp = icmp slt i32 
%inc, 300 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.inc + %aggr_prod.0.lcssa = phi double [ %add, %for.inc ] + %conv3 = fptrunc double %aggr_prod.0.lcssa to float + ret float %conv3 +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +attributes #0 = { nounwind } Index: llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -force-vector-interleave=1 -vectorizer-maximize-bandwidth=0 -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx \ +; RUN: -force-vector-interleave=1 -S | FileCheck %s target triple = "powerpc64-unknown-linux-gnu" @@ -31,8 +32,8 @@ %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ] ret i32 %count.0.lcssa -; CHECK: load <4 x i8> -; CHECK: icmp slt <4 x i8> +; CHECK: load <16 x i8> +; CHECK: icmp slt <16 x i8> } @@ -66,8 +67,8 @@ ret i16 %count.0.lcssa ; CHECK-LABEL: foo2 -; CHECK: load <8 x i8> -; CHECK: icmp slt <8 x i8> +; CHECK: load <16 x i8> +; CHECK: icmp slt <16 x i8> } define signext i32 @foo3(i16* readonly %ptr, i32 signext %l) { @@ -100,8 +101,8 @@ ret i32 %count.0.lcssa ; CHECK-LABEL: foo3 -; CHECK: load <4 x i16> -; CHECK: icmp slt <4 x i16> +; CHECK: load <8 x i16> +; CHECK: icmp slt <8 x i16> } define i64 @foo4(i16* readonly %ptr, i32 signext %l) { @@ -134,7 +135,7 @@ ret i64 %count.0.lcssa ; CHECK-LABEL: foo4 -; CHECK: load <2 x i16> -; CHECK: icmp slt <2 x i16> +; CHECK: load <8 x i16> +; CHECK: icmp slt <8 x i16> }