Hoisting and sinking instructions out of conditional blocks enables
additional vectorization by:
- Executing memory accesses unconditionally.
- Reducing the number of instructions that need predication.
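To make this concrete, here is a minimal C sketch (the function and names
are made up for illustration, not taken from the patch or the benchmarks):
the load of a[i] is guarded by the condition, so vectorizing the loop as-is
requires a masked/predicated load.

    /* Hypothetical example: the load of a[i] only happens when c[i] > 0, so
     * the vectorizer has to predicate the access (e.g. with a masked load)
     * or give up on the loop. */
    void before_hoist(int *restrict dst, const int *restrict a,
                      const int *restrict c, int n) {
      for (int i = 0; i < n; i++) {
        int v = 0;
        if (c[i] > 0)
          v = a[i]; /* conditional memory access -> needs predication */
        dst[i] = v;
      }
    }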
After disabling early hoisting/sinking, we miss out on a few
vectorization opportunities. One of them causes a ~10% performance
regression in one of the Geekbench benchmarks on AArch64.
This patch tries to recover the regression by running hoisting/sinking
inside each inner loop before vectorization. This is not ideal, because
we also hoist/sink in loops that won't be vectorized. But LV already
does similar transformations for all inner loops (e.g. LoopSimplify and
LCSSA construction). Alternatively we could run a separate
loop-sink-hoist pass, but I am not sure that's worth the effort.
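To illustrate what running hoisting/sinking per inner loop buys the
vectorizer, here is the shape the sketch above takes once the conditional
load has been hoisted (again a hypothetical example; this is only legal when
the load is known safe to speculate, e.g. the location is dereferenceable):

    /* After hoisting, the load executes unconditionally and only a select
     * remains to be predicated, so the vectorizer can emit a plain wide load
     * plus a vector compare/select instead of a masked load. */
    void after_hoist(int *restrict dst, const int *restrict a,
                     const int *restrict c, int n) {
      for (int i = 0; i < n; i++) {
        int t = a[i];                /* hoisted out of the conditional block */
        dst[i] = (c[i] > 0) ? t : 0; /* predication reduced to a select */
      }
    }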
In the long term, the sinking/hoisting could and should be done in
VPlan, but that requires handling at least parts of legality and
cost modeling in VPlan as well.
Details about the impact on compile-time can be found here:
http://llvm-compile-time-tracker.com/compare.php?from=3a71d0de397e3a15c943ca59a00243ba8b7154da&to=c4efd69f4733b46e5de8fc2fa6e4c2495750d339&stat=instructions
NewPM-O3: geomean +0.18%
NewPM-ReleaseThinLTO: geomean +0.17%
NewPM-ReleaseLTO-g: geomean +0.18%
In terms of the number of loops vectorized, we have the following changes
across MultiSource/SPEC2000/SPEC2006 on X86 with LTO (baseline vs. patched
loop counts):

test-suite...000/186.crafty/186.crafty.test    20.00   22.00  10.0%
test-suite...006/450.soplex/450.soplex.test    85.00   86.00   1.2%
test-suite.../CINT2006/403.gcc/403.gcc.test   209.00  211.00   1.0%
test-suite...6/464.h264ref/464.h264ref.test   156.00  157.00   0.6%
test-suite...ications/JM/lencod/lencod.test   215.00  216.00   0.5%
And +0.5% more loops are vectorized in Geekbench on AArch64.
FWIW, these sink/hoist helpers are fine with lazy updates.