Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10250,6 +10250,20 @@ case ISD::BITCAST: // Don't handle bitcast here. return; + case ISD::FP_EXTEND: { + if (N->getValueType(0) != MVT::v4f64) + return; + SDValue Input = N->getOperand(0); + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Input, + DAG.getIntPtrConstant(0, dl)); + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Input, + DAG.getIntPtrConstant(2, dl)); + SDValue HiExt = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Hi); + SDValue LoExt = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Lo); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, HiExt, LoExt); + Results.push_back(Concat); + return; + } } } @@ -14944,12 +14958,13 @@ } bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); + EVT ScalVT = VT.getScalarType(); - if (!VT.isSimple()) + // No FMAs for types that are not simple or are too wide.
+ if (!ScalVT.isSimple() || (!Subtarget.hasQPX() && VT.getSizeInBits() > 128)) return false; - switch (VT.getSimpleVT().SimpleTy) { + switch (ScalVT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -83,6 +83,9 @@ unsigned getCacheLineSize() const override; unsigned getPrefetchDistance() const override; unsigned getMaxInterleaveFactor(unsigned VF); + bool shouldMaximizeVectorBandwidth(bool OptSize) const { + return !OptSize; + } int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, Index: llvm/test/CodeGen/PowerPC/vec_fmuladd.ll =================================================================== --- llvm/test/CodeGen/PowerPC/vec_fmuladd.ll +++ llvm/test/CodeGen/PowerPC/vec_fmuladd.ll @@ -1,4 +1,5 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr6 -mattr=+altivec \ +; RUN: --enable-unsafe-fp-math < %s | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" target triple = "powerpc64-unknown-linux-gnu" Index: llvm/test/Transforms/LoopVectorize/PowerPC/max-vec-bandwidth.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/PowerPC/max-vec-bandwidth.ll @@ -0,0 +1,290 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -S | FileCheck %s + +target triple = "powerpc64le-unknown-linux-gnu" + +define dso_local float @test(float* noalias %thing1, float* noalias %thing2) #0 { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, 
label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP132:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP133:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP134:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP135:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP136:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP139:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP140:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP141:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP142:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI11:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP143:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> 
[[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION12:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION13:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION14:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION15:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION16:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION17:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION18:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION19:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION20:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION21:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION22:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 12 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 20 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 24 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 28 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 36 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 40 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 44 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = sext i32 [[TMP9]] to i64 +; 
CHECK-NEXT: [[TMP22:%.*]] = sext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[THING1:%.*]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP37]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, <4 x float>* [[TMP39]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 8 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast float* [[TMP40]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP41]], align 4 +; CHECK-NEXT: 
[[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 12 +; CHECK-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <4 x float>, <4 x float>* [[TMP43]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 16 +; CHECK-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <4 x float>, <4 x float>* [[TMP45]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 20 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <4 x float>, <4 x float>* [[TMP47]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 24 +; CHECK-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <4 x float>, <4 x float>* [[TMP49]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 28 +; CHECK-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <4 x float>, <4 x float>* [[TMP51]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 32 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP53]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 36 +; CHECK-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <4 x float>, <4 x float>* [[TMP55]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 40 +; CHECK-NEXT: [[TMP57:%.*]] = bitcast float* [[TMP56]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <4 x float>, <4 x float>* [[TMP57]], align 4 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds 
float, float* [[TMP24]], i32 44 +; CHECK-NEXT: [[TMP59:%.*]] = bitcast float* [[TMP58]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <4 x float>, <4 x float>* [[TMP59]], align 4 +; CHECK-NEXT: [[TMP60:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP61:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP62:%.*]] = sext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP63:%.*]] = sext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP64:%.*]] = sext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP66:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP67:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP68:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP69:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP70:%.*]] = sext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP71:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[THING2:%.*]], i64 [[TMP60]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP61]] +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP62]] +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP63]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP64]] +; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP65]] +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP66]] +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP67]] +; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP68]] +; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP69]] +; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP70]] +; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[TMP71]] +; 
CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = bitcast float* [[TMP84]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, <4 x float>* [[TMP85]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 4 +; CHECK-NEXT: [[TMP87:%.*]] = bitcast float* [[TMP86]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD35:%.*]] = load <4 x float>, <4 x float>* [[TMP87]], align 4 +; CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 8 +; CHECK-NEXT: [[TMP89:%.*]] = bitcast float* [[TMP88]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD36:%.*]] = load <4 x float>, <4 x float>* [[TMP89]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 12 +; CHECK-NEXT: [[TMP91:%.*]] = bitcast float* [[TMP90]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD37:%.*]] = load <4 x float>, <4 x float>* [[TMP91]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 16 +; CHECK-NEXT: [[TMP93:%.*]] = bitcast float* [[TMP92]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD38:%.*]] = load <4 x float>, <4 x float>* [[TMP93]], align 4 +; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 20 +; CHECK-NEXT: [[TMP95:%.*]] = bitcast float* [[TMP94]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD39:%.*]] = load <4 x float>, <4 x float>* [[TMP95]], align 4 +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 24 +; CHECK-NEXT: [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD40:%.*]] = load <4 x float>, <4 x float>* [[TMP97]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 28 +; CHECK-NEXT: [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD41:%.*]] = load <4 x float>, <4 x float>* [[TMP99]], align 4 +; CHECK-NEXT: [[TMP100:%.*]] = getelementptr 
inbounds float, float* [[TMP72]], i32 32 +; CHECK-NEXT: [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD42:%.*]] = load <4 x float>, <4 x float>* [[TMP101]], align 4 +; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 36 +; CHECK-NEXT: [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD43:%.*]] = load <4 x float>, <4 x float>* [[TMP103]], align 4 +; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 40 +; CHECK-NEXT: [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD44:%.*]] = load <4 x float>, <4 x float>* [[TMP105]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 44 +; CHECK-NEXT: [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD45:%.*]] = load <4 x float>, <4 x float>* [[TMP107]], align 4 +; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD34]] +; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <4 x float> [[WIDE_LOAD23]], [[WIDE_LOAD35]] +; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <4 x float> [[WIDE_LOAD24]], [[WIDE_LOAD36]] +; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <4 x float> [[WIDE_LOAD25]], [[WIDE_LOAD37]] +; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <4 x float> [[WIDE_LOAD26]], [[WIDE_LOAD38]] +; CHECK-NEXT: [[TMP113:%.*]] = fmul fast <4 x float> [[WIDE_LOAD27]], [[WIDE_LOAD39]] +; CHECK-NEXT: [[TMP114:%.*]] = fmul fast <4 x float> [[WIDE_LOAD28]], [[WIDE_LOAD40]] +; CHECK-NEXT: [[TMP115:%.*]] = fmul fast <4 x float> [[WIDE_LOAD29]], [[WIDE_LOAD41]] +; CHECK-NEXT: [[TMP116:%.*]] = fmul fast <4 x float> [[WIDE_LOAD30]], [[WIDE_LOAD42]] +; CHECK-NEXT: [[TMP117:%.*]] = fmul fast <4 x float> [[WIDE_LOAD31]], [[WIDE_LOAD43]] +; CHECK-NEXT: [[TMP118:%.*]] = fmul fast <4 x float> [[WIDE_LOAD32]], [[WIDE_LOAD44]] +; CHECK-NEXT: [[TMP119:%.*]] = fmul fast <4 x float> [[WIDE_LOAD33]], [[WIDE_LOAD45]] +; 
CHECK-NEXT: [[TMP120:%.*]] = fpext <4 x float> [[TMP108]] to <4 x double> +; CHECK-NEXT: [[TMP121:%.*]] = fpext <4 x float> [[TMP109]] to <4 x double> +; CHECK-NEXT: [[TMP122:%.*]] = fpext <4 x float> [[TMP110]] to <4 x double> +; CHECK-NEXT: [[TMP123:%.*]] = fpext <4 x float> [[TMP111]] to <4 x double> +; CHECK-NEXT: [[TMP124:%.*]] = fpext <4 x float> [[TMP112]] to <4 x double> +; CHECK-NEXT: [[TMP125:%.*]] = fpext <4 x float> [[TMP113]] to <4 x double> +; CHECK-NEXT: [[TMP126:%.*]] = fpext <4 x float> [[TMP114]] to <4 x double> +; CHECK-NEXT: [[TMP127:%.*]] = fpext <4 x float> [[TMP115]] to <4 x double> +; CHECK-NEXT: [[TMP128:%.*]] = fpext <4 x float> [[TMP116]] to <4 x double> +; CHECK-NEXT: [[TMP129:%.*]] = fpext <4 x float> [[TMP117]] to <4 x double> +; CHECK-NEXT: [[TMP130:%.*]] = fpext <4 x float> [[TMP118]] to <4 x double> +; CHECK-NEXT: [[TMP131:%.*]] = fpext <4 x float> [[TMP119]] to <4 x double> +; CHECK-NEXT: [[TMP132]] = fadd fast <4 x double> [[VEC_PHI]], [[TMP120]] +; CHECK-NEXT: [[TMP133]] = fadd fast <4 x double> [[VEC_PHI1]], [[TMP121]] +; CHECK-NEXT: [[TMP134]] = fadd fast <4 x double> [[VEC_PHI2]], [[TMP122]] +; CHECK-NEXT: [[TMP135]] = fadd fast <4 x double> [[VEC_PHI3]], [[TMP123]] +; CHECK-NEXT: [[TMP136]] = fadd fast <4 x double> [[VEC_PHI4]], [[TMP124]] +; CHECK-NEXT: [[TMP137]] = fadd fast <4 x double> [[VEC_PHI5]], [[TMP125]] +; CHECK-NEXT: [[TMP138]] = fadd fast <4 x double> [[VEC_PHI6]], [[TMP126]] +; CHECK-NEXT: [[TMP139]] = fadd fast <4 x double> [[VEC_PHI7]], [[TMP127]] +; CHECK-NEXT: [[TMP140]] = fadd fast <4 x double> [[VEC_PHI8]], [[TMP128]] +; CHECK-NEXT: [[TMP141]] = fadd fast <4 x double> [[VEC_PHI9]], [[TMP129]] +; CHECK-NEXT: [[TMP142]] = fadd fast <4 x double> [[VEC_PHI10]], [[TMP130]] +; CHECK-NEXT: [[TMP143]] = fadd fast <4 x double> [[VEC_PHI11]], [[TMP131]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 48 +; CHECK-NEXT: [[TMP144:%.*]] = icmp eq i32 [[INDEX_NEXT]], 288 +; CHECK-NEXT: br i1 [[TMP144]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP133]], [[TMP132]] +; CHECK-NEXT: [[BIN_RDX46:%.*]] = fadd fast <4 x double> [[TMP134]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX47:%.*]] = fadd fast <4 x double> [[TMP135]], [[BIN_RDX46]] +; CHECK-NEXT: [[BIN_RDX48:%.*]] = fadd fast <4 x double> [[TMP136]], [[BIN_RDX47]] +; CHECK-NEXT: [[BIN_RDX49:%.*]] = fadd fast <4 x double> [[TMP137]], [[BIN_RDX48]] +; CHECK-NEXT: [[BIN_RDX50:%.*]] = fadd fast <4 x double> [[TMP138]], [[BIN_RDX49]] +; CHECK-NEXT: [[BIN_RDX51:%.*]] = fadd fast <4 x double> [[TMP139]], [[BIN_RDX50]] +; CHECK-NEXT: [[BIN_RDX52:%.*]] = fadd fast <4 x double> [[TMP140]], [[BIN_RDX51]] +; CHECK-NEXT: [[BIN_RDX53:%.*]] = fadd fast <4 x double> [[TMP141]], [[BIN_RDX52]] +; CHECK-NEXT: [[BIN_RDX54:%.*]] = fadd fast <4 x double> [[TMP142]], [[BIN_RDX53]] +; CHECK-NEXT: [[BIN_RDX55:%.*]] = fadd fast <4 x double> [[TMP143]], [[BIN_RDX54]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[BIN_RDX55]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX56:%.*]] = fadd fast <4 x double> [[BIN_RDX55]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF57:%.*]] = shufflevector <4 x double> [[BIN_RDX56]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX58:%.*]] = fadd fast <4 x double> [[BIN_RDX56]], [[RDX_SHUF57]] +; CHECK-NEXT: [[TMP145:%.*]] = extractelement <4 x double> [[BIN_RDX58]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 300, 288 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 288, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP145]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[AGGR_PROD_02:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], 
[[FOR_INC:%.*]] ] +; CHECK-NEXT: [[I_01:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I_01]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[THING1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP146:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[I_01]] to i64 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[THING2]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[TMP147:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP146]], [[TMP147]] +; CHECK-NEXT: [[CONV:%.*]] = fpext float [[MUL]] to double +; CHECK-NEXT: [[ADD]] = fadd fast double [[AGGR_PROD_02]], [[CONV]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_01]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 300 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !2 +; CHECK: for.end: +; CHECK-NEXT: [[AGGR_PROD_0_LCSSA:%.*]] = phi double [ [[ADD]], [[FOR_INC]] ], [ [[TMP145]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[CONV3:%.*]] = fptrunc double [[AGGR_PROD_0_LCSSA]] to float +; CHECK-NEXT: ret float [[CONV3]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc + %aggr_prod.02 = phi double [ 0.000000e+00, %entry ], [ %add, %for.inc ] + %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %idxprom = sext i32 %i.01 to i64 + %arrayidx = getelementptr inbounds float, float* %thing1, i64 %idxprom + %0 = load float, float* %arrayidx, align 4 + %idxprom1 = sext i32 %i.01 to i64 + %arrayidx2 = getelementptr inbounds float, float* %thing2, i64 %idxprom1 + %1 = load float, float* %arrayidx2, align 4 + %mul = fmul fast float %0, %1 + %conv = fpext float %mul to double + %add = fadd fast double %aggr_prod.02, %conv + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.01, 1 + %cmp = icmp slt i32 
%inc, 300 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.inc + %aggr_prod.0.lcssa = phi double [ %add, %for.inc ] + %conv3 = fptrunc double %aggr_prod.0.lcssa to float + ret float %conv3 +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +attributes #0 = { nounwind } Index: llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -force-vector-interleave=1 -vectorizer-maximize-bandwidth=0 -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx \ +; RUN: -force-vector-interleave=1 -S | FileCheck %s target triple = "powerpc64-unknown-linux-gnu" @@ -31,8 +32,8 @@ %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ] ret i32 %count.0.lcssa -; CHECK: load <4 x i8> -; CHECK: icmp slt <4 x i8> +; CHECK: load <16 x i8> +; CHECK: icmp slt <16 x i8> } @@ -66,8 +67,8 @@ ret i16 %count.0.lcssa ; CHECK-LABEL: foo2 -; CHECK: load <8 x i8> -; CHECK: icmp slt <8 x i8> +; CHECK: load <16 x i8> +; CHECK: icmp slt <16 x i8> } define signext i32 @foo3(i16* readonly %ptr, i32 signext %l) { @@ -100,8 +101,8 @@ ret i32 %count.0.lcssa ; CHECK-LABEL: foo3 -; CHECK: load <4 x i16> -; CHECK: icmp slt <4 x i16> +; CHECK: load <8 x i16> +; CHECK: icmp slt <8 x i16> } define i64 @foo4(i16* readonly %ptr, i32 signext %l) { @@ -134,7 +135,7 @@ ret i64 %count.0.lcssa ; CHECK-LABEL: foo4 -; CHECK: load <2 x i16> -; CHECK: icmp slt <2 x i16> +; CHECK: load <8 x i16> +; CHECK: icmp slt <8 x i16> }