Diff 244536

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Show All 10 Lines
// vectorization passes.		// vectorization passes.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"		#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/Statistic.h"		#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"		#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"		#include "llvm/Analysis/TargetTransformInfo.h"
		#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Dominators.h"		#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"		#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"		#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Transforms/Vectorize.h"		#include "llvm/Transforms/Vectorize.h"
#include "llvm/Transforms/Utils/Local.h"		#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;		using namespace llvm;
using namespace llvm::PatternMatch;		using namespace llvm::PatternMatch;

#define DEBUG_TYPE "vector-combine"		#define DEBUG_TYPE "vector-combine"
STATISTIC(NumVecCmp, "Number of vector compares formed");		STATISTIC(NumVecCmp, "Number of vector compares formed");
		STATISTIC(NumVecBO, "Number of vector binops formed");

static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) {		static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) {
// Match a cmp with extracted vector operands.		// Match a cmp with extracted vector operands.
CmpInst::Predicate Pred;		CmpInst::Predicate Pred;
Instruction *Ext0, *Ext1;		Instruction *Ext0, *Ext1;
if (!match(&I, m_Cmp(Pred, m_Instruction(Ext0), m_Instruction(Ext1))))		if (!match(&I, m_Cmp(Pred, m_Instruction(Ext0), m_Instruction(Ext1))))
return false;		return false;

Show All 30 Lines	static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) {
IRBuilder<> Builder(&I);		IRBuilder<> Builder(&I);
Value *VecCmp = IsFP ? Builder.CreateFCmp(Pred, V0, V1)		Value *VecCmp = IsFP ? Builder.CreateFCmp(Pred, V0, V1)
: Builder.CreateICmp(Pred, V0, V1);		: Builder.CreateICmp(Pred, V0, V1);
Value *Ext = Builder.CreateExtractElement(VecCmp, C);		Value *Ext = Builder.CreateExtractElement(VecCmp, C);
I.replaceAllUsesWith(Ext);		I.replaceAllUsesWith(Ext);
return true;		return true;
}		}

		/// Try to reduce extract element costs by converting scalar binops to vector
		/// binops followed by extract.
		static bool foldExtractBinop(Instruction &I, const TargetTransformInfo &TTI) {
		// It is not safe to transform things like div, urem, etc. because we may
		// create undefined behavior when executing those on unknown vector elements.
		if (!isSafeToSpeculativelyExecute(&I))
		return false;

		// Match a scalar binop with extracted vector operands:
		lebedev.riUnsubmitted Done Reply Inline Actions Hm, and if there is only one extract, and the other operand is constant, we consider that vectorizing the constant in some way will always be costlier? Just thinking out loud, not for this patch. lebedev.ri: Hm, and if there is only one extract, and the other operand is constant, we consider that…
		spatelAuthorUnsubmitted Done Reply Inline Actions This would imply that the vector binop is cheaper than the scalar binop since we would have the same extract either way. As I mentioned in an earlier comment, I haven't found that case yet on x86, but it's possible it exists for some binop on some subtarget. spatel: This would imply that the vector binop is cheaper than the scalar binop since we would have the…
		lebedev.riUnsubmitted Done Reply Inline Actions I'm mainly just trying to think of plausible edge cases that we might care about, but just didn't think of. lebedev.ri: I'm mainly just trying to think of plausible edge cases that we might care about, but just…
		// bo (extelt X, C0), (extelt Y, C1)
		Instruction *Ext0, *Ext1;
		if (!match(&I, m_BinOp(m_Instruction(Ext0), m_Instruction(Ext1))))
		return false;

		Value *X, *Y;
		uint64_t C0, C1;
		if (!match(Ext0, m_ExtractElement(m_Value(X), m_ConstantInt(C0))) ||
		!match(Ext1, m_ExtractElement(m_Value(Y), m_ConstantInt(C1))) ||
		X->getType() != Y->getType())
		return false;

		// Check if using a vector binop would be cheaper.
		Instruction::BinaryOps BOpcode = cast<BinaryOperator>(I).getOpcode();
		lebedev.riUnsubmitted Done Reply Inline Actions But even if it's the same extract, do we not care whether it will go away or not, unlike the case with two different extracts? I.e. for now i'd expect if (!(Ext0 == Ext1 && Ext0->hasNUses(2)) && !(Ext0->hasOneUse() && Ext1->hasOneUse())) return false; lebedev.ri: But even if it's the same extract, do we not care whether it will go away or not, unlike the…
		lebedev.riUnsubmitted Done Reply Inline Actions (it might be better to handle extract cost from the getgo?) lebedev.ri: (it might be better to handle extract cost from the getgo?)
		spatelAuthorUnsubmitted Done Reply Inline Actions Yes, identical operands creates a loophole that might allow vectorization where it wasn't intended, so we should include it in this patch rather than making a follow-on. I've added a pile of tests that hopefully check all of the possibilities now. I don't see any x86 combos where the vector op is cheaper than the sibling scalar op, so those will all be negative tests. spatel: Yes, identical operands creates a loophole that might allow vectorization where it wasn't…
		Type *ScalarTy = I.getType();
		Type *VecTy = X->getType();
		lebedev.riUnsubmitted Done Reply Inline Actions Don't you want to ensure that both `X` and `Y` have the same type? (we didn't forget that in `foldExtractCmp()`) lebedev.ri: Don't you want to ensure that both `X` and `Y` have the same type? (we didn't forget that in…
		spatelAuthorUnsubmitted Done Reply Inline Actions Yes - good catch. Accidentally dropped that check. spatel: Yes - good catch. Accidentally dropped that check.
		int ScalarBOCost = TTI.getArithmeticInstrCost(BOpcode, ScalarTy);
		int VecBOCost = TTI.getArithmeticInstrCost(BOpcode, VecTy);
		int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
		VecTy, C0);
		int Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
		VecTy, C1);

		// Handle a special case - if the extract indexes are the same, the
		// replacement sequence does not require a shuffle. Unless the vector binop is
		// much more expensive than the scalar binop, this eliminates an extract.
		// Extra uses of the extracts mean that we include those costs in the
		// vector total because those instructions will not be eliminated.
		if (C0 == C1) {
		assert(Extract0Cost == Extract1Cost && "Different costs for same extract?");
		lebedev.riUnsubmitted Done Reply Inline Actions If we are extracting the same lane, the cost should be identical right? I wonder if it would be more readable to assert that, do `int ExtractCost = Extract0Cost;` and operate on it here. lebedev.ri: If we are extracting the same lane, the cost should be identical right? I wonder if it would be…
		spatelAuthorUnsubmitted Done Reply Inline Actions Yes, this is a leftover of starting the patch on the more general case. I'll change it. spatel: Yes, this is a leftover of starting the patch on the more general case. I'll change it.
		int ExtractCost = Extract0Cost;
		if (X != Y) {
		int ScalarCost = ExtractCost + ExtractCost + ScalarBOCost;
		int VecCost = VecBOCost + ExtractCost +
		!Ext0->hasOneUse() * ExtractCost +
		!Ext1->hasOneUse() * ExtractCost;
		if (ScalarCost <= VecCost)
		return false;
		} else {
		// Handle an extra-special case. If the 2 binop operands are identical,
		// adjust the formulas to account for that:
		// bo (extelt X, C), (extelt X, C) --> extelt (bo X, X), C
		// The extra use charge allows for either the CSE'd pattern or an
		lebedev.riUnsubmitted Done Reply Inline Actions I'd think it would be more obvious to do `!Ext0->hasNUses(2)` instead. It can't have less than two uses - there are two uses in our entry binop. And if there are more than two uses then those are The extra uses. Otherwise, why do we check `hasOneUse()` instead of `hasNUsesOrMore(2)`? lebedev.ri: I'd think it would be more obvious to do `!Ext0->hasNUses(2)` instead. It can't have less than…
		spatelAuthorUnsubmitted Done Reply Inline Actions I agree that your suggestion is better for readability. Will change. Thanks for the detailed review! spatel: I agree that your suggestion is better for readability. Will change. Thanks for the detailed…
		lebedev.riUnsubmitted Not Done Reply Inline Actions Otherwise, why do we check hasOneUse() instead of hasNUsesOrMore(2)? Err, that of course should have read "Otherwise, why do we check hasOneUse() instead of hasNUsesOrMore(1)" lebedev.ri: > Otherwise, why do we check hasOneUse() instead of hasNUsesOrMore(2)? Err, that of course…
		// unoptimized form with identical values.
		bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
		: !Ext0->hasOneUse() || !Ext1->hasOneUse();
		lebedev.riUnsubmitted Done Reply Inline Actions Thinking out loud: what about the case where `X == Y`? If the extract is of the same element, could consider doing extract from `2X` or `X+X` If elements are different, could consider forming `hadd` lebedev.ri:* Thinking out loud: what about the case where `X == Y`? * If the extract is of the same element…
		spatelAuthorUnsubmitted Done Reply Inline Actions For now (with same element extraction), it should work as expected. I'll add a test. I don't think we want to attempt canonicalization like X+X --> X<<1 if that's what you're thinking of. Let's leave that to instcombine. With different elements, we'll want to be careful that we are not obscuring a pattern that the backend recognizes. I think we're ok on that either way for x86 horizontal math, but I'll take a closer look before the planned enhancement. spatel: For now (with same element extraction), it should work as expected. I'll add a test. I don't…
		int ScalarCost = ExtractCost + ScalarBOCost;
		int VecCost = VecBOCost + ExtractCost + HasUseTax * ExtractCost;
		if (ScalarCost <= VecCost)
		return false;
		}

		// bo (extelt X, C), (extelt Y, C) --> extelt (bo X, Y), C
		++NumVecBO;
		IRBuilder<> Builder(&I);
		Value *NewBO = Builder.CreateBinOp(BOpcode, X, Y);
		if (auto *VecBOInst = dyn_cast<Instruction>(NewBO)) {
		// All IR flags are safe to back-propagate because any potential poison
		// created in unused vector elements is discarded by the extract.
		VecBOInst->copyIRFlags(&I);
		}
		Value *Extract = Builder.CreateExtractElement(NewBO, Ext0->getOperand(1));
		I.replaceAllUsesWith(Extract);
		return true;
		}

		// TODO: Handle C0 != C1 by shuffling 1 of the operands.
		return false;
		}

/// This is the entry point for all transforms. Pass manager differences are		/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.		/// handled in the callers of this function.
static bool runImpl(Function &F, const TargetTransformInfo &TTI,		static bool runImpl(Function &F, const TargetTransformInfo &TTI,
const DominatorTree &DT) {		const DominatorTree &DT) {
bool MadeChange = false;		bool MadeChange = false;
for (BasicBlock &BB : F) {		for (BasicBlock &BB : F) {
// Ignore unreachable basic blocks.		// Ignore unreachable basic blocks.
if (!DT.isReachableFromEntry(&BB))		if (!DT.isReachableFromEntry(&BB))
continue;		continue;
// Do not delete instructions under here and invalidate the iterator.		// Do not delete instructions under here and invalidate the iterator.
// Walk the block backwards for efficiency. We're matching a chain of		// Walk the block backwards for efficiency. We're matching a chain of
// use->defs, so we're more likely to succeed by starting from the bottom.		// use->defs, so we're more likely to succeed by starting from the bottom.
// TODO: It could be more efficient to remove dead instructions		// TODO: It could be more efficient to remove dead instructions
// iteratively in this loop rather than waiting until the end.		// iteratively in this loop rather than waiting until the end.
for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {		for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
MadeChange |= foldExtractCmp(I, TTI);		MadeChange |= foldExtractCmp(I, TTI);
// TODO: More transforms go here.		MadeChange |= foldExtractBinop(I, TTI);
}		}
}		}

// We're done with transforms, so remove dead instructions.		// We're done with transforms, so remove dead instructions.
if (MadeChange)		if (MadeChange)
for (BasicBlock &BB : F)		for (BasicBlock &BB : F)
SimplifyInstructionsInBlock(&BB);		SimplifyInstructionsInBlock(&BB);

▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines

llvm/test/Transforms/VectorCombine/X86/extract-binop.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- | FileCheck %s			; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- | FileCheck %s

				; Eliminating extract is profitable.

	define i8 @ext0_ext0_add(<16 x i8> %x, <16 x i8> %y) {			define i8 @ext0_ext0_add(<16 x i8> %x, <16 x i8> %y) {
	; CHECK-LABEL: @ext0_ext0_add(			; CHECK-LABEL: @ext0_ext0_add(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0			; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y:%.*]]
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0			; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
	; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]			; CHECK-NEXT: ret i8 [[TMP2]]
	; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 0			%e0 = extractelement <16 x i8> %x, i32 0
	%e1 = extractelement <16 x i8> %y, i32 0			%e1 = extractelement <16 x i8> %y, i32 0
	%r = add i8 %e0, %e1			%r = add i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

				; Eliminating extract is still profitable. Flags propagate.

	define i8 @ext1_ext1_add_flags(<16 x i8> %x, <16 x i8> %y) {			define i8 @ext1_ext1_add_flags(<16 x i8> %x, <16 x i8> %y) {
	; CHECK-LABEL: @ext1_ext1_add_flags(			; CHECK-LABEL: @ext1_ext1_add_flags(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1			; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <16 x i8> [[X:%.*]], [[Y:%.*]]
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1			; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
	; CHECK-NEXT: [[R:%.*]] = add nuw nsw i8 [[E0]], [[E1]]			; CHECK-NEXT: ret i8 [[TMP2]]
	; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 1			%e0 = extractelement <16 x i8> %x, i32 1
	%e1 = extractelement <16 x i8> %y, i32 1			%e1 = extractelement <16 x i8> %y, i32 1
	%r = add nsw nuw i8 %e0, %e1			%r = add nsw nuw i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

				; Negative test - eliminating extract is profitable, but vector shift is expensive.

	define i8 @ext1_ext1_shl(<16 x i8> %x, <16 x i8> %y) {			define i8 @ext1_ext1_shl(<16 x i8> %x, <16 x i8> %y) {
	; CHECK-LABEL: @ext1_ext1_shl(			; CHECK-LABEL: @ext1_ext1_shl(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1			; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1			; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
	; CHECK-NEXT: [[R:%.*]] = shl i8 [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = shl i8 [[E0]], [[E1]]
	; CHECK-NEXT: ret i8 [[R]]			; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 1			%e0 = extractelement <16 x i8> %x, i32 1
	%e1 = extractelement <16 x i8> %y, i32 1			%e1 = extractelement <16 x i8> %y, i32 1
	%r = shl i8 %e0, %e1			%r = shl i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

				; Negative test - eliminating extract is profitable, but vector multiply is expensive.

	define i8 @ext13_ext13_mul(<16 x i8> %x, <16 x i8> %y) {			define i8 @ext13_ext13_mul(<16 x i8> %x, <16 x i8> %y) {
	; CHECK-LABEL: @ext13_ext13_mul(			; CHECK-LABEL: @ext13_ext13_mul(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13			; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 13			; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 13
	; CHECK-NEXT: [[R:%.*]] = mul i8 [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = mul i8 [[E0]], [[E1]]
	; CHECK-NEXT: ret i8 [[R]]			; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 13			%e0 = extractelement <16 x i8> %x, i32 13
	%e1 = extractelement <16 x i8> %y, i32 13			%e1 = extractelement <16 x i8> %y, i32 13
	%r = mul i8 %e0, %e1			%r = mul i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

				; Negative test - cost is irrelevant because sdiv has potential UB.

	define i8 @ext0_ext0_sdiv(<16 x i8> %x, <16 x i8> %y) {			define i8 @ext0_ext0_sdiv(<16 x i8> %x, <16 x i8> %y) {
	; CHECK-LABEL: @ext0_ext0_sdiv(			; CHECK-LABEL: @ext0_ext0_sdiv(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0			; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0			; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
	; CHECK-NEXT: [[R:%.*]] = sdiv i8 [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = sdiv i8 [[E0]], [[E1]]
	; CHECK-NEXT: ret i8 [[R]]			; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 0			%e0 = extractelement <16 x i8> %x, i32 0
	%e1 = extractelement <16 x i8> %y, i32 0			%e1 = extractelement <16 x i8> %y, i32 0
	%r = sdiv i8 %e0, %e1			%r = sdiv i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

				; Negative test - extracts are free and vector op has same cost as scalar.

	define double @ext0_ext0_fadd(<2 x double> %x, <2 x double> %y) {			define double @ext0_ext0_fadd(<2 x double> %x, <2 x double> %y) {
	; CHECK-LABEL: @ext0_ext0_fadd(			; CHECK-LABEL: @ext0_ext0_fadd(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0			; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
	; CHECK-NEXT: [[E1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0			; CHECK-NEXT: [[E1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
	; CHECK-NEXT: [[R:%.*]] = fadd double [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = fadd double [[E0]], [[E1]]
	; CHECK-NEXT: ret double [[R]]			; CHECK-NEXT: ret double [[R]]
	;			;
	%e0 = extractelement <2 x double> %x, i32 0			%e0 = extractelement <2 x double> %x, i32 0
	%e1 = extractelement <2 x double> %y, i32 0			%e1 = extractelement <2 x double> %y, i32 0
	%r = fadd double %e0, %e1			%r = fadd double %e0, %e1
	ret double %r			ret double %r
	}			}

				; Eliminating extract is profitable. Flags propagate.

	define double @ext1_ext1_fsub(<2 x double> %x, <2 x double> %y) {			define double @ext1_ext1_fsub(<2 x double> %x, <2 x double> %y) {
	; CHECK-LABEL: @ext1_ext1_fsub(			; CHECK-LABEL: @ext1_ext1_fsub(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1			; CHECK-NEXT: [[TMP1:%.*]] = fsub fast <2 x double> [[X:%.*]], [[Y:%.*]]
	; CHECK-NEXT: [[E1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1			; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
	; CHECK-NEXT: [[R:%.*]] = fsub fast double [[E0]], [[E1]]			; CHECK-NEXT: ret double [[TMP2]]
	; CHECK-NEXT: ret double [[R]]
	;			;
	%e0 = extractelement <2 x double> %x, i32 1			%e0 = extractelement <2 x double> %x, i32 1
	%e1 = extractelement <2 x double> %y, i32 1			%e1 = extractelement <2 x double> %y, i32 1
	%r = fsub fast double %e0, %e1			%r = fsub fast double %e0, %e1
	ret double %r			ret double %r
	}			}

				; Negative test - type mismatch.

	define double @ext1_ext1_fadd_different_types(<2 x double> %x, <4 x double> %y) {			define double @ext1_ext1_fadd_different_types(<2 x double> %x, <4 x double> %y) {
	; CHECK-LABEL: @ext1_ext1_fadd_different_types(			; CHECK-LABEL: @ext1_ext1_fadd_different_types(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1			; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
	; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x double> [[Y:%.*]], i32 1			; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x double> [[Y:%.*]], i32 1
	; CHECK-NEXT: [[R:%.*]] = fadd fast double [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = fadd fast double [[E0]], [[E1]]
	; CHECK-NEXT: ret double [[R]]			; CHECK-NEXT: ret double [[R]]
	;			;
	%e0 = extractelement <2 x double> %x, i32 1			%e0 = extractelement <2 x double> %x, i32 1
	%e1 = extractelement <4 x double> %y, i32 1			%e1 = extractelement <4 x double> %y, i32 1
	%r = fadd fast double %e0, %e1			%r = fadd fast double %e0, %e1
	ret double %r			ret double %r
	}			}

				; Negative test - disguised same vector operand; scalar code is cheaper than general case.

	define i32 @ext1_ext1_add_same_vec(<4 x i32> %x) {			define i32 @ext1_ext1_add_same_vec(<4 x i32> %x) {
	; CHECK-LABEL: @ext1_ext1_add_same_vec(			; CHECK-LABEL: @ext1_ext1_add_same_vec(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1			; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
	; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[X]], i32 1			; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[X]], i32 1
	; CHECK-NEXT: [[R:%.*]] = add i32 [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = add i32 [[E0]], [[E1]]
	; CHECK-NEXT: ret i32 [[R]]			; CHECK-NEXT: ret i32 [[R]]
	;			;
	%e0 = extractelement <4 x i32> %x, i32 1			%e0 = extractelement <4 x i32> %x, i32 1
	%e1 = extractelement <4 x i32> %x, i32 1			%e1 = extractelement <4 x i32> %x, i32 1
	%r = add i32 %e0, %e1			%r = add i32 %e0, %e1
	ret i32 %r			ret i32 %r
	}			}

				; Negative test - same vector operand; scalar code is cheaper than general case.

	define i32 @ext1_ext1_add_same_vec_cse(<4 x i32> %x) {			define i32 @ext1_ext1_add_same_vec_cse(<4 x i32> %x) {
	; CHECK-LABEL: @ext1_ext1_add_same_vec_cse(			; CHECK-LABEL: @ext1_ext1_add_same_vec_cse(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1			; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
	; CHECK-NEXT: [[R:%.*]] = add i32 [[E0]], [[E0]]			; CHECK-NEXT: [[R:%.*]] = add i32 [[E0]], [[E0]]
	; CHECK-NEXT: ret i32 [[R]]			; CHECK-NEXT: ret i32 [[R]]
	;			;
	%e0 = extractelement <4 x i32> %x, i32 1			%e0 = extractelement <4 x i32> %x, i32 1
	%r = add i32 %e0, %e0			%r = add i32 %e0, %e0
	ret i32 %r			ret i32 %r
	}			}

	declare void @use_i8(i8)			declare void @use_i8(i8)

				; Negative test - same vector operand; scalar code is cheaper than general case
				; and vector code would be more expensive still.

	define i8 @ext1_ext1_add_same_vec_extra_use0(<16 x i8> %x) {			define i8 @ext1_ext1_add_same_vec_extra_use0(<16 x i8> %x) {
	; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use0(			; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use0(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0			; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
	; CHECK-NEXT: call void @use_i8(i8 [[E0]])			; CHECK-NEXT: call void @use_i8(i8 [[E0]])
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[X]], i32 0			; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[X]], i32 0
	; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
	; CHECK-NEXT: ret i8 [[R]]			; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 0			%e0 = extractelement <16 x i8> %x, i32 0
	call void @use_i8(i8 %e0)			call void @use_i8(i8 %e0)
	%e1 = extractelement <16 x i8> %x, i32 0			%e1 = extractelement <16 x i8> %x, i32 0
	%r = add i8 %e0, %e1			%r = add i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

				; Negative test - same vector operand; scalar code is cheaper than general case
				; and vector code would be more expensive still.

	define i8 @ext1_ext1_add_same_vec_extra_use1(<16 x i8> %x) {			define i8 @ext1_ext1_add_same_vec_extra_use1(<16 x i8> %x) {
	; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use1(			; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use1(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0			; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[X]], i32 0			; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[X]], i32 0
	; CHECK-NEXT: call void @use_i8(i8 [[E1]])			; CHECK-NEXT: call void @use_i8(i8 [[E1]])
	; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
	; CHECK-NEXT: ret i8 [[R]]			; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 0			%e0 = extractelement <16 x i8> %x, i32 0
	%e1 = extractelement <16 x i8> %x, i32 0			%e1 = extractelement <16 x i8> %x, i32 0
	call void @use_i8(i8 %e1)			call void @use_i8(i8 %e1)
	%r = add i8 %e0, %e1			%r = add i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

				; Negative test - same vector operand; scalar code is cheaper than general case
				; and vector code would be more expensive still.

	define i8 @ext1_ext1_add_same_vec_cse_extra_use(<16 x i8> %x) {			define i8 @ext1_ext1_add_same_vec_cse_extra_use(<16 x i8> %x) {
	; CHECK-LABEL: @ext1_ext1_add_same_vec_cse_extra_use(			; CHECK-LABEL: @ext1_ext1_add_same_vec_cse_extra_use(
	; CHECK-NEXT: [[E:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0			; CHECK-NEXT: [[E:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
	; CHECK-NEXT: call void @use_i8(i8 [[E]])			; CHECK-NEXT: call void @use_i8(i8 [[E]])
	; CHECK-NEXT: [[R:%.*]] = add i8 [[E]], [[E]]			; CHECK-NEXT: [[R:%.*]] = add i8 [[E]], [[E]]
	; CHECK-NEXT: ret i8 [[R]]			; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e = extractelement <16 x i8> %x, i32 0			%e = extractelement <16 x i8> %x, i32 0
	call void @use_i8(i8 %e)			call void @use_i8(i8 %e)
	%r = add i8 %e, %e			%r = add i8 %e, %e
	ret i8 %r			ret i8 %r
	}			}

				; Negative test - vector code would not be cheaper.

	define i8 @ext1_ext1_add_uses1(<16 x i8> %x, <16 x i8> %y) {			define i8 @ext1_ext1_add_uses1(<16 x i8> %x, <16 x i8> %y) {
	; CHECK-LABEL: @ext1_ext1_add_uses1(			; CHECK-LABEL: @ext1_ext1_add_uses1(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0			; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
	; CHECK-NEXT: call void @use_i8(i8 [[E0]])			; CHECK-NEXT: call void @use_i8(i8 [[E0]])
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0			; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
	; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
	; CHECK-NEXT: ret i8 [[R]]			; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 0			%e0 = extractelement <16 x i8> %x, i32 0
	call void @use_i8(i8 %e0)			call void @use_i8(i8 %e0)
	%e1 = extractelement <16 x i8> %y, i32 0			%e1 = extractelement <16 x i8> %y, i32 0
	%r = add i8 %e0, %e1			%r = add i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

				; Negative test - vector code would not be cheaper.

	define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {			define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
	; CHECK-LABEL: @ext1_ext1_add_uses2(			; CHECK-LABEL: @ext1_ext1_add_uses2(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0			; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0			; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
	; CHECK-NEXT: call void @use_i8(i8 [[E1]])			; CHECK-NEXT: call void @use_i8(i8 [[E1]])
	; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
	; CHECK-NEXT: ret i8 [[R]]			; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 0			%e0 = extractelement <16 x i8> %x, i32 0
	%e1 = extractelement <16 x i8> %y, i32 0			%e1 = extractelement <16 x i8> %y, i32 0
	call void @use_i8(i8 %e1)			call void @use_i8(i8 %e1)
	%r = add i8 %e0, %e1			%r = add i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

				; TODO: Different extract indexes requires a shuffle.

	define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {			define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
	; CHECK-LABEL: @ext0_ext1_add(			; CHECK-LABEL: @ext0_ext1_add(
	; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0			; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
	; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1			; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
	; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]			; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
	; CHECK-NEXT: ret i8 [[R]]			; CHECK-NEXT: ret i8 [[R]]
	;			;
	%e0 = extractelement <16 x i8> %x, i32 0			%e0 = extractelement <16 x i8> %x, i32 0
	%e1 = extractelement <16 x i8> %y, i32 1			%e1 = extractelement <16 x i8> %y, i32 1
	%r = add i8 %e0, %e1			%r = add i8 %e0, %e1
	ret i8 %r			ret i8 %r
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[VectorCombine] try to form vector binop to eliminate an extract element
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 244536

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

llvm/test/Transforms/VectorCombine/X86/extract-binop.ll

This is an archive of the discontinued LLVM Phabricator instance.

[VectorCombine] try to form vector binop to eliminate an extract elementClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 244536

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

llvm/test/Transforms/VectorCombine/X86/extract-binop.ll

[VectorCombine] try to form vector binop to eliminate an extract element
ClosedPublic