diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3190,6 +3190,75 @@
   return Cost;
 }
 
+static bool isLoadOrFreeCast(X86TTIImpl *TTI, const Value *V,
+                             TTI::TargetCostKind CostKind) {
+  if (isa<LoadInst>(V))
+    return true;
+  const auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+  switch (I->getOpcode()) {
+  case Instruction::Trunc:
+  case Instruction::BitCast:
+  case Instruction::PtrToInt:
+    // The cast of a load is free if the cast operation itself is free.
+    return I->hasOneUse() &&
+           TTI->getCastInstrCost(
+               I->getOpcode(), I->getType(), I->getOperand(0)->getType(),
+               TTI::getCastContextHint(I), CostKind, I) == 0 &&
+           isa<LoadInst>(I->getOperand(0)) &&
+           I->getParent() == cast<LoadInst>(I->getOperand(0))->getParent();
+  default:
+    break;
+  }
+  return false;
+}
+
+static bool isFreeOp(X86TTIImpl *TTI, const Instruction *I, const Type *Ty,
+                     TTI::TargetCostKind CostKind, const Instruction *OpI) {
+  switch (I->getOpcode()) {
+  case Instruction::FAdd:
+  case Instruction::FMul:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    bool IsLoadOrFreeCast0 = isLoadOrFreeCast(TTI, I->getOperand(0), CostKind);
+    bool IsLoadOrFreeCast1 = isLoadOrFreeCast(TTI, I->getOperand(1), CostKind);
+    bool IsNotConstant0 = !isa<Constant>(I->getOperand(0));
+    bool IsNotConstant1 = !isa<Constant>(I->getOperand(1));
+    bool IsSingleUseOp0 = I->getOperand(0)->hasOneUse();
+    bool IsSingleUseOp1 = I->getOperand(1)->hasOneUse();
+    bool SameParentOp0 =
+        isa<Instruction>(I->getOperand(0)) &&
+        cast<Instruction>(I->getOperand(0))->getParent() == OpI->getParent();
+    bool SameParentOp1 =
+        isa<Instruction>(I->getOperand(1)) &&
+        cast<Instruction>(I->getOperand(1))->getParent() == OpI->getParent();
+    return (I->getOperand(1) == OpI && !IsLoadOrFreeCast0 && IsSingleUseOp0 &&
+            IsNotConstant0 && SameParentOp0) ||
+           (I->getOperand(0) == OpI && !IsLoadOrFreeCast1 && IsSingleUseOp1 &&
+            IsNotConstant1 && SameParentOp1 &&
+            (!isa<FPMathOperator>(I) || I->hasAllowReassoc())) ||
+           (IsLoadOrFreeCast0 && IsLoadOrFreeCast1 &&
+            (((!IsSingleUseOp1 || !SameParentOp1) && I->getOperand(1) == OpI) ||
+             ((!IsSingleUseOp0 || !SameParentOp0) && I->getOperand(0) == OpI &&
+              (!isa<FPMathOperator>(I) || I->hasAllowReassoc())))) ||
+           I->getOperand(1) == OpI;
+  }
+  case Instruction::Sub:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FSub:
+  case Instruction::FDiv:
+    return I->getOperand(1) == OpI;
+  }
+  return false;
+}
+
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                 MaybeAlign Alignment, unsigned AddressSpace,
                                 TTI::TargetCostKind CostKind,
@@ -3252,6 +3321,33 @@
   if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
     Cost *= 2;
 
+  if (Opcode == Instruction::Load && Cost == LT.first && !ST->isSLM() &&
+      isa_and_nonnull<LoadInst>(I) && I->hasOneUse()) {
+    // TODO: Include SLM when the cost model is correct.
+    // Check if the cost of the scalar load can be considered 0. It can, if an
+    // arithmetic, logic, compare, or cast instruction is able to use the
+    // memory address directly as one of its operands.
+    const Instruction *UI = cast<Instruction>(I->user_back());
+    if (UI->getParent() == I->getParent() &&
+        isFreeOp(this, UI, UI->getType(), CostKind, I))
+      return 0;
+    switch (UI->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::BitCast:
+    case Instruction::PtrToInt:
+      // The cast of the load is free if the cast operation itself is free.
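+      // Illustrative sketch only (hypothetical IR, not an extra condition
+      // beyond the checks below): for
+      //   %w = load i64, i64* %p
+      //   %t = trunc i64 %w to i32
+      //   %r = add i32 %x, %t
+      // in a single block, the add can read a 32-bit memory operand, so the
+      // load and its free trunc together cost nothing.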
+      if (cast<Instruction>(UI->user_back())->getParent() == UI->getParent() &&
+          UI->hasOneUse() &&
+          getCastInstrCost(UI->getOpcode(), UI->getType(),
+                           UI->getOperand(0)->getType(),
+                           TTI::getCastContextHint(UI), CostKind, UI) == 0 &&
+          isFreeOp(this, cast<Instruction>(UI->user_back()), UI->getType(),
+                   CostKind, UI))
+        return 0;
+      break;
+    default:
+      break;
+    }
+  }
+
   return Cost;
 }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3764,22 +3764,33 @@
     }
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
-      Align alignment = cast<LoadInst>(VL0)->getAlign();
-      InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
-          Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
+      const auto *LI = cast<LoadInst>(VL0);
+      Align Alignment = LI->getAlign();
+      InstructionCost ScalarLdCost = 0;
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        for (const unsigned I : E->ReuseShuffleIndices) {
+          const auto *LD = cast<LoadInst>(VL[I]);
+          ReuseShuffleCost -=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                   LD->getPointerAddressSpace(), CostKind, LD);
+        }
+      } else {
+        for (Value *V : VL) {
+          const auto *LD = cast<LoadInst>(V);
+          ScalarLdCost +=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                   LD->getPointerAddressSpace(), CostKind, LD);
+        }
       }
-      InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
       InstructionCost VecLdCost;
       if (E->State == TreeEntry::Vectorize) {
-        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
                                          CostKind, VL0);
       } else {
         assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
         VecLdCost = TTI->getGatherScatterOpCost(
             Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
-            /*VariableMask=*/false, alignment, CostKind, VL0);
+            /*VariableMask=*/false, Alignment, CostKind, VL0);
       }
       if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
         SmallVector<int> NewMask;
diff --git a/llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll b/llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll
--- a/llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll
@@ -10,7 +10,7 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i32_interleave4() {
 ;CHECK-LABEL: load_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an estimated cost of 0 for VF 1 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 5 for VF 4 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load
@@ -46,7 +46,7 @@
 
 define void @load_i32_interleave5() {
 ;CHECK-LABEL: load_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an estimated cost of 0 for VF 1 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load
diff --git a/llvm/test/Analysis/CostModel/X86/vectorized-loop.ll b/llvm/test/Analysis/CostModel/X86/vectorized-loop.ll
--- a/llvm/test/Analysis/CostModel/X86/vectorized-loop.ll
+++ b/llvm/test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -58,7 +58,7 @@
 ;CHECK: cost of 1 {{.*}} mul
   %mul = mul nsw i32 %14, 5
   %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  ;CHECK: cost of 1 {{.*}} load
+  ;CHECK: cost of 0 {{.*}} load
   %15 = load i32, i32* %arrayidx2, align 4
   %add3 = add nsw i32 %15, %mul
   store i32 %add3, i32* %arrayidx2, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=SSE
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefix=AVX
-; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX2
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SSE
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=SSE
@@ -28,32 +28,52 @@
 ;
 ; AVX-LABEL: @foo(
 ; AVX-NEXT: entry:
-; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX: vector.ph:
-; AVX-NEXT: br label [[VECTOR_BODY:%.*]]
-; AVX: vector.body:
-; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; AVX-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; AVX-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
-; AVX-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
-; AVX-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
-; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; AVX-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; AVX-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
-; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; AVX-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; AVX-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
-; AVX: middle.block:
-; AVX-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; AVX: scalar.ph:
 ; AVX-NEXT: br label [[FOR_BODY:%.*]]
 ; AVX: for.cond.cleanup:
 ; AVX-NEXT: ret void
 ; AVX: for.body:
-; AVX-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; AVX-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; AVX-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
+; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; AVX-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX-NEXT: store i32 [[ADD4]], i32* [[ARRAYIDX6]], align 4
+; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
+; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+;
+; AVX2-LABEL: @foo(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX2: vector.ph:
+; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX2: vector.body:
+; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
+; AVX2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX2-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX2-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
+; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; AVX2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; AVX2: middle.block:
+; AVX2-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; AVX2: scalar.ph:
+; AVX2-NEXT: br label [[FOR_BODY:%.*]]
+; AVX2: for.cond.cleanup:
+; AVX2-NEXT: ret void
+; AVX2: for.body:
+; AVX2-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -4,7 +4,7 @@
 ; Two cases tested AVX (MaxVF=8 = TripCount) and AVX512 (MaxVF=16 > TripCount)
 
 ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=skylake-avx512 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=skylake-avx512 -S | FileCheck %s --check-prefix=AVX512
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -17,22 +17,22 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !llvm.access.group !0
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
 ; CHECK: middle.block:
@@ -55,6 +55,51 @@
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
+; AVX512-LABEL: @small_tc(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX512: vector.ph:
+; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
+; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; AVX512-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; AVX512-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; AVX512-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; AVX512-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
+; AVX512-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
+; AVX512: middle.block:
+; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8
+; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX512: scalar.ph:
+; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP3:!llvm.loop !.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -27,23 +27,23 @@
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !llvm.access.group !0
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 20
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -166,22 +166,22 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !llvm.access.group !6
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6
-; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
@@ -248,16 +248,18 @@
 define float @fadd_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fadd_v4i32(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7:!tbaa !.*]]
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast float -0.000000e+00, [[TMP2]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[BIN_RDX5]], 4.200000e+01
-; CHECK-NEXT: ret float [[OP_EXTRA]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7:!tbaa !.*]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], 4.200000e+01
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[ADD]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
+; CHECK-NEXT: ret float [[ADD_3]]
 ;
 entry:
   br label %for.cond
@@ -289,16 +291,18 @@
 define float @fmul_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fmul_v4i32(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul fast <4 x float> [[TMP1]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = fmul fast float 1.000000e+00, [[TMP2]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = fmul fast float [[BIN_RDX5]], 4.200000e+01
-; CHECK-NEXT: ret float [[OP_EXTRA]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP0]], 4.200000e+01
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP1]], [[MUL]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP2]], [[MUL_1]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP3]], [[MUL_2]]
+; CHECK-NEXT: ret float [[MUL_3]]
 ;
 entry:
   br label %for.cond
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -126,13 +126,28 @@
 define i32 @TestVectorsEqual_alt(i32* noalias %Vec0, i32* noalias %Vec1, i32 %Tolerance) {
 ; CHECK-LABEL: @TestVectorsEqual_alt(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[VEC0:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[VEC0:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[VEC1:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[VEC0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[VEC1]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[VEC0]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[VEC1]], i64 2
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[VEC0]], i64 3
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[VEC1]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[ADD_3:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[ADD_3]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3_NOT]] to i32
 ; CHECK-NEXT: ret i32 [[COND]]
 ;
@@ -233,13 +248,28 @@
 define i32 @TestVectorsEqualFP_alt(float* noalias %Vec0, float* noalias %Vec1, float %Tolerance) {
 ; CHECK-LABEL: @TestVectorsEqualFP_alt(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[VEC0:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[VEC0:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[VEC1:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[VEC0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, float* [[VEC1]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX2_1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[TMP0]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[VEC0]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds float, float* [[VEC1]], i64 2
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX2_2]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[VEC0]], i64 3
+; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds float, float* [[VEC1]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = fadd fast float [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fsub fast float [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[ADD_3]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32
 ; CHECK-NEXT: ret i32 [[COND]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=i386-apple-macosx10.9.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=i386-apple-macosx10.9.0 -mcpu=corei7-avx -slp-threshold=-1 | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -791,11 +791,13 @@
 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
 ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
@@ -804,8 +806,8 @@
 ; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
 ; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
 ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
@@ -822,16 +824,18 @@
 ; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
-; CHECK-NEXT: ret float [[TMP12]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; CHECK-NEXT: [[TMP9:%.*]] = load <16 x float>, <16 x float>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP7]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[OP_RDX]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = fadd fast float [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[TMP3]]
+; CHECK-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP14]], [[TMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = fadd fast float [[TMP15]], [[TMP1]]
+; CHECK-NEXT: [[TMP17:%.*]] = fadd fast float [[TMP16]], [[TMP0]]
+; CHECK-NEXT: ret float [[TMP17]]
 ;
 ; THRESHOLD-LABEL: @loadadd31(
 ; THRESHOLD-NEXT: entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -153,24 +153,22 @@
 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 1
 ; CHECK-NEXT: [[Q2:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 2
 ; CHECK-NEXT: [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
-; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP14]]
+; CHECK-NEXT: [[X0:%.*]] = load i64, i64* [[P0]], align 2
+; CHECK-NEXT: [[X1:%.*]] = load i64, i64* [[P1]], align 2
+; CHECK-NEXT: [[X2:%.*]] = load i64, i64* [[P2]], align 2
+; CHECK-NEXT: [[X3:%.*]] = load i64, i64* [[P3]], align 2
+; CHECK-NEXT: [[Y0:%.*]] = load i64, i64* [[Q0]], align 2
+; CHECK-NEXT: [[Y1:%.*]] = load i64, i64* [[Q1]], align 2
+; CHECK-NEXT: [[Y2:%.*]] = load i64, i64* [[Q2]], align 2
+; CHECK-NEXT: [[Y3:%.*]] = load i64, i64* [[Q3]], align 2
+; CHECK-NEXT: [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
+; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
+; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
+; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[SUB0]]
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB1]]
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB2]]
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB3]]
 ; CHECK-NEXT: ret void
 ;
 %p0 = getelementptr inbounds i64, i64* %p, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -153,24 +153,22 @@
 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 1
 ; CHECK-NEXT: [[Q2:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 2
 ; CHECK-NEXT: [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
-; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP14]]
+; CHECK-NEXT: [[X0:%.*]] = load i64, i64* [[P0]], align 2
+; CHECK-NEXT: [[X1:%.*]] = load i64, i64* [[P1]], align 2
+; CHECK-NEXT: [[X2:%.*]] = load i64, i64* [[P2]], align 2
+; CHECK-NEXT: [[X3:%.*]] = load i64, i64* [[P3]], align 2
+; CHECK-NEXT: [[Y0:%.*]] = load i64, i64* [[Q0]], align 2
+; CHECK-NEXT: [[Y1:%.*]] = load i64, i64* [[Q1]], align 2
+; CHECK-NEXT: [[Y2:%.*]] = load i64, i64* [[Q2]], align 2
+; CHECK-NEXT: [[Y3:%.*]] = load i64, i64* [[Q3]], align 2
+; CHECK-NEXT: [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
+; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
+; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
+; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[SUB0]]
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB1]]
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB2]]
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[SUB3]]
 ; CHECK-NEXT: ret void
 ;
 %p0 = getelementptr inbounds i64, i64* %p, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -319,36 +319,34 @@
 ; CHECK-LABEL: @lookahead_limit_users_budget(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
+; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
 ; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
 ; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
 ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double*> poison, double* [[B:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[B]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
+; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
+; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8
 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
+; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
+; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
 ; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B1]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
+; CHECK-NEXT: [[SUBC0D0:%.*]] = fsub fast double [[C0]], [[D0]]
+; CHECK-NEXT: [[SUBA1B2:%.*]] = fsub fast double [[A1]], [[B2]]
+; CHECK-NEXT: [[SUBA2B1:%.*]] = fsub fast double [[A2]], [[B1]]
+; CHECK-NEXT: [[ADD0:%.*]] = fadd fast double [[SUBA0B0]], [[SUBC0D0]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[SUBA1B2]], [[SUBA2B1]]
 ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT: store double [[TMP14]], double* [[EXT1:%.*]], align 8
-; CHECK-NEXT: store double [[TMP14]], double* [[EXT2:%.*]], align 8
-; CHECK-NEXT: store double [[TMP14]], double* [[EXT3:%.*]], align 8
+; CHECK-NEXT: store double [[ADD0]], double* [[IDXS0]], align 8
+; CHECK-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8
+; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
+; CHECK-NEXT: store double [[A1]], double* [[EXT2:%.*]], align 8
+; CHECK-NEXT: store double [[A1]], double* [[EXT3:%.*]], align 8
 ; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8
 ; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -34,17 +34,16 @@
 ; AVX-NEXT: [[OR_1:%.*]] = or i64 undef, 1
 ; AVX-NEXT: store i64 [[OR_1]], i64* undef, align 8
 ; AVX-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+; AVX-NEXT: [[FOO_3:%.*]] = load i64, i64* [[FOO_1]], align 8
 ; AVX-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
-; AVX-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
-; AVX-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; AVX-NEXT: [[FOO_4:%.*]] = load i64, i64* [[FOO_2]], align 8
 ; AVX-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8
-; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0
-; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
-; AVX-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
+; AVX-NEXT: [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]]
+; AVX-NEXT: [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]]
 ; AVX-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; AVX-NEXT: store i64 [[AND_2]], i64* [[BAR3]], align 8
 ; AVX-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
-; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
-; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; AVX-NEXT: store i64 [[AND_1]], i64* [[BAR4]], align 8
 ; AVX-NEXT: ret void
 ;
 for.body.lr.ph.i:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -63,45 +63,31 @@
 ; }
 
 define i32 @test_mul(i32* nocapture readonly %p) {
-; AVX-LABEL: @test_mul(
-; AVX-NEXT: entry:
-; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
-; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]])
-; AVX-NEXT: ret i32 [[TMP2]]
-;
-; SSE-LABEL: @test_mul(
-; SSE-NEXT: entry:
-; SSE-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
-; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; SSE-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
-; SSE-NEXT: [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
-; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
-; SSE-NEXT: [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
-; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
-; SSE-NEXT: [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
-; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
-; SSE-NEXT: [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
-; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
-; SSE-NEXT: [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
-; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
-; SSE-NEXT: [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
-; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
-; SSE-NEXT: [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
-; SSE-NEXT: ret i32 [[MUL_714]]
+; CHECK-LABEL: @test_mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT: [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT: [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT: [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT: [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT: ret i32 [[MUL_714]]
 ;
 entry:
   %0 = load i32, i32* %p, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -106,7 +106,7 @@
  ; YAML-NEXT: Function: foo
  ; YAML-NEXT: Args:
  ; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
- ; YAML-NEXT: - Cost: '-5'
+ ; YAML-NEXT: - Cost: '-2'
  ; YAML-NEXT: - String: ' and with tree size '
  ; YAML-NEXT: - TreeSize: '4'
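
A minimal sketch (illustrative only, not part of the patch; the function and value names are invented) of the scalar pattern the updated X86TTIImpl::getMemoryOpCost now prices at 0: a single-use load whose only user is an arithmetic instruction in the same basic block, which x86 can encode with the load folded into a memory operand (addl (%rdi), %esi):

define i32 @fold_example(i32* %p, i32 %x) {
entry:
  %v = load i32, i32* %p, align 4    ; single use, same block as its user
  %r = add i32 %x, %v                ; the load folds into the add
  ret i32 %r
}

Note that for Sub/UDiv/SDiv/FSub/FDiv, isFreeOp only accepts the load as the second operand (I->getOperand(1) == OpI), matching x86, where only the right-hand source of a subtraction or the divisor of a division can come from memory.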