When vectorizing loops that operate on values that start out narrow and are extended inside the loop, we do not maximize vector throughput and overall do a poor job of vectorizing.
Example:
    double test(float *__restrict thing1, float *__restrict thing2) {
      int i = 0;
      double aggr_prod = 0.0;
      for (i = 0; i < 300; i++) {
        aggr_prod += (thing1[i] * thing2[i]);
      }
      return aggr_prod;
    }
We currently vectorize this only by a factor of 2, then extend early and perform FMAs for the computation. However, it is much faster (see the sketch after this list) to:
- Vectorize by a factor of 4
- Perform the multiplication in single precision
- Extend the results of the multiplication and perform the addition in double precision
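For illustration only, the preferred lowering corresponds roughly to the following hand-vectorized sketch using SSE2 intrinsics; the function name test_vec4, the trip-count parameter n, and the SSE2 target are assumptions for the example, not part of the patch, and the reduction is reassociated as a vectorized reduction would be:

    #include <immintrin.h>

    double test_vec4(float *__restrict thing1, float *__restrict thing2, int n) {
      __m128d acc_lo = _mm_setzero_pd();
      __m128d acc_hi = _mm_setzero_pd();
      int i = 0;
      for (; i + 4 <= n; i += 4) {
        __m128 a = _mm_loadu_ps(&thing1[i]);
        __m128 b = _mm_loadu_ps(&thing2[i]);
        /* Multiply all four lanes in single precision. */
        __m128 prod = _mm_mul_ps(a, b);
        /* Extend the products to double and accumulate. */
        __m128d lo = _mm_cvtps_pd(prod);
        __m128d hi = _mm_cvtps_pd(_mm_movehl_ps(prod, prod));
        acc_lo = _mm_add_pd(acc_lo, lo);
        acc_hi = _mm_add_pd(acc_hi, hi);
      }
      /* Horizontal reduction of the two accumulators. */
      double tmp[2];
      _mm_storeu_pd(tmp, _mm_add_pd(acc_lo, acc_hi));
      double aggr_prod = tmp[0] + tmp[1];
      /* Scalar remainder loop. */
      for (; i < n; i++)
        aggr_prod += thing1[i] * thing2[i];
      return aggr_prod;
    }

The key point is that the multiply stays at vector width 4 in single precision, and only the accumulation is widened to double.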
This patch improves the performance of an important kernel by 50%, which in turn provides a very significant improvement on the benchmark that contains the kernel. It also has no detrimental effect on the performance of other benchmarks, as measured by SPEC results.