Diff 27923

include/llvm/CodeGen/Passes.h

Show First 20 Lines • Show All 631 Lines • ▼ Show 20 Lines	/// MachineDominanaceFrontier - This pass is a machine dominators analysis pass.
extern char &StackMapLivenessID;		extern char &StackMapLivenessID;

/// createJumpInstrTables - This pass creates jump-instruction tables.		/// createJumpInstrTables - This pass creates jump-instruction tables.
ModulePass *createJumpInstrTablesPass();		ModulePass *createJumpInstrTablesPass();

/// createForwardControlFlowIntegrityPass - This pass adds control-flow		/// createForwardControlFlowIntegrityPass - This pass adds control-flow
/// integrity.		/// integrity.
ModulePass *createForwardControlFlowIntegrityPass();		ModulePass *createForwardControlFlowIntegrityPass();

		/// InterleavedAccess Pass - This pass identifies interleaved memory accesses
		/// (a vector load or store combined with shufflevectors) and matches them to
		/// target specific intrinsics.
		///
		/// \p TM is the target machine whose lowering hooks are queried.
		FunctionPass *createInterleavedAccessPass(const TargetMachine *TM);
} // End llvm namespace		} // End llvm namespace

/// Target machine pass initializer for passes with dependencies. Use with		/// Target machine pass initializer for passes with dependencies. Use with
/// INITIALIZE_TM_PASS_END.		/// INITIALIZE_TM_PASS_END.
#define INITIALIZE_TM_PASS_BEGIN INITIALIZE_PASS_BEGIN		#define INITIALIZE_TM_PASS_BEGIN INITIALIZE_PASS_BEGIN

/// Target machine pass initializer for passes with dependencies. Use with		/// Target machine pass initializer for passes with dependencies. Use with
/// INITIALIZE_TM_PASS_BEGIN.		/// INITIALIZE_TM_PASS_BEGIN.
Show All 21 Lines

include/llvm/Target/TargetLowering.h

Show First 20 Lines • Show All 1,591 Lines • ▼ Show 20 Lines	virtual bool hasPairedLoad(Type * /LoadedType/,
return false;		return false;
}		}

virtual bool hasPairedLoad(EVT /LoadedType/,		virtual bool hasPairedLoad(EVT /LoadedType/,
unsigned & /RequiredAligment/) const {		unsigned & /RequiredAligment/) const {
return false;		return false;
}		}

		/// \brief Lower an interleaved load to target specific intrinsics. Return
		/// true on success.
		///
		/// \p LI is the vector load instruction.
		/// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
		/// \p Indices is the corresponding indices for each shufflevector.
		/// \p Factor is the interleave factor.
		virtual bool lowerInterleavedLoad(LoadInst *LI,
		ArrayRef<ShuffleVectorInst *> Shuffles,
		ArrayRef<unsigned> Indices,
		unsigned Factor) const {
		return false;
		}

		/// \brief Lower an interleaved store to target specific intrinsics. Return
		/// true on success.
		///
		/// \p SI is the vector store instruction.
		/// \p SVI is the shufflevector to RE-interleave the stored vector.
		/// \p Factor is the interleave factor.
		virtual bool lowerInterleavedStore(StoreInst SI, ShuffleVectorInst SVI,
		unsigned Factor) const {
		return false;
		}

/// Return true if zero-extending the specific node Val to type VT2 is free		/// Return true if zero-extending the specific node Val to type VT2 is free
/// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or		/// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
/// because it's folded such as X86 zero-extending loads).		/// because it's folded such as X86 zero-extending loads).
virtual bool isZExtFree(SDValue Val, EVT VT2) const {		virtual bool isZExtFree(SDValue Val, EVT VT2) const {
return isZExtFree(Val.getValueType(), VT2);		return isZExtFree(Val.getValueType(), VT2);
}		}

/// Return true if an fpext operation is free (for instance, because		/// Return true if an fpext operation is free (for instance, because
▲ Show 20 Lines • Show All 1,199 Lines • Show Last 20 Lines

lib/CodeGen/CMakeLists.txt

Show All 24 Lines	add_llvm_library(LLVMCodeGen
GCMetadataPrinter.cpp		GCMetadataPrinter.cpp
GCRootLowering.cpp		GCRootLowering.cpp
GCStrategy.cpp		GCStrategy.cpp
GlobalMerge.cpp		GlobalMerge.cpp
IfConversion.cpp		IfConversion.cpp
ImplicitNullChecks.cpp		ImplicitNullChecks.cpp
InlineSpiller.cpp		InlineSpiller.cpp
InterferenceCache.cpp		InterferenceCache.cpp
		InterleavedAccessPass.cpp
IntrinsicLowering.cpp		IntrinsicLowering.cpp
LLVMTargetMachine.cpp		LLVMTargetMachine.cpp
LatencyPriorityQueue.cpp		LatencyPriorityQueue.cpp
LexicalScopes.cpp		LexicalScopes.cpp
LiveDebugVariables.cpp		LiveDebugVariables.cpp
LiveInterval.cpp		LiveInterval.cpp
LiveIntervalAnalysis.cpp		LiveIntervalAnalysis.cpp
LiveIntervalUnion.cpp		LiveIntervalUnion.cpp
▲ Show 20 Lines • Show All 96 Lines • Show Last 20 Lines

lib/CodeGen/InterleavedAccessPass.cpp

This file was added.

				//===--------------------- InterleavedAccessPass.cpp ----------------------===//
				//
				// The LLVM Compiler Infrastructure
				//
				// This file is distributed under the University of Illinois Open Source
				// License. See LICENSE.TXT for details.
				//
				//===----------------------------------------------------------------------===//
				//
				// This file implements the Interleaved Access pass, which identifies
				// interleaved memory accesses and transforms them into target specific
				// intrinsics.
				//
				// An interleaved load reads data from memory into several vectors,
				// DE-interleaving the data by a factor. An interleaved store writes several
				// vectors to memory, RE-interleaving the data by a factor. The interleave
				// factor is equal to the number of vectors.
				//
				// As interleaved accesses are hard to identify in CodeGen (mainly because
				// the VECTOR_SHUFFLE DAG node is quite different from the shufflevector IR),
				// we identify and transform them to intrinsics in this pass, so that we
				// can easily match them into target specific instructions later in CodeGen.
				//
				// E.g. An interleaved load (Factor = 2):
				// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
				// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
				// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
				//
				// It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
				// intrinsic in ARM backend.
				//
				// E.g. An interleaved store (Factor = 2):
				// %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7> ; Interleaved vec
				// store <8 x i32> %i.vec, <8 x i32>* %ptr
				mzolotukhinUnsubmitted Not Done Reply Inline Actions How would IR look for 4 vectors? Will we have a shuffle of shuffles? mzolotukhin: How would IR look for 4 vectors? Will we have a shuffle of shuffles?
				HaoLiuAuthorUnsubmitted Not Done Reply Inline Actions Will have a shuffle. E.g. An interleaved store of factor 4. %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v2, <0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15> store <16 x i32> %i.vec, <16 x i32>* %ptr %v0 and %v2 could be concatenated from other small vectors, like: %v0 = shuffle <4 x i32> %A, <4 x i32> %B, <0, 1, 2, 3, 4, 5, 6, 7> %v1 = shuffle <4 x i32> %C, <4 x i32> %D, <0, 1, 2, 3, 4, 5, 6, 7> but we only need to check the last shuffle with the RE-interleaved mask. HaoLiu: Will have a shuffle. E.g. An interleaved store of factor 4. %i.vec = shuffle <8 x i32>…
				//
				// It could be transformed into a st2 intrinsic in AArch64 backend or a vst2
				// intrinsic in ARM backend.
				//
				//===----------------------------------------------------------------------===//

				#include "llvm/CodeGen/Passes.h"
				#include "llvm/IR/InstIterator.h"
				#include "llvm/Support/Debug.h"
				#include "llvm/Support/MathExtras.h"
				#include "llvm/Target/TargetLowering.h"
				#include "llvm/Target/TargetSubtargetInfo.h"

				using namespace llvm;

				#define DEBUG_TYPE "interleaved-access"

				static const unsigned MIN_FACTOR = 2;
				static const unsigned MAX_FACTOR = 4;

				namespace llvm {
				static void initializeInterleavedAccessPass(PassRegistry &);
				}

				mzolotukhinUnsubmitted Not Done Reply Inline Actions Do these names comply with the coding standards? mzolotukhin: Do these names comply with the coding standards?
				namespace {

				/// Function pass that visits every load and store of a function and asks the
				/// target's TargetLowering hooks to lower the interleaved accesses it finds.
				class InterleavedAccess : public FunctionPass {

				public:
				// Pass identification, handed to the FunctionPass base class.
				static char ID;
				// Registers the pass with the global PassRegistry. TLI starts out null and
				// is looked up per function in runOnFunction().
				InterleavedAccess(const TargetMachine *TM = nullptr)
				: FunctionPass(ID), TM(TM), TLI(nullptr) {
				initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
				}

				const char *getPassName() const override { return "Interleaved Access Pass"; }

				bool runOnFunction(Function &F) override;

				private:
				// May be null; runOnFunction() then leaves the function untouched.
				const TargetMachine *TM;
				// Target lowering of the subtarget of the function being processed.
				const TargetLowering *TLI;

				/// \brief Transform an interleaved load into target specific intrinsics.
				/// On success the replaced instructions are appended to \p DeadInsts.
				bool matchInterleavedLoad(LoadInst *LI,
				SmallVector<Instruction *, 32> &DeadInsts);

				/// \brief Transform an interleaved store into target specific intrinsics.
				/// On success the replaced instructions are appended to \p DeadInsts.
				bool matchInterleavedStore(StoreInst *SI,
				SmallVector<Instruction *, 32> &DeadInsts);
				};
				} // end anonymous namespace.

				char InterleavedAccess::ID = 0;
				INITIALIZE_TM_PASS(InterleavedAccess, "interleaved-access",
				"Match interleaved memory accesses to target specific intrinsics",
				false, false)

				/// \brief Create an Interleaved Access pass bound to the target machine
				/// \p TM. The caller receives a newly allocated pass object.
				FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
				  return new InterleavedAccess(TM);
				}

				/// \brief Test whether \p Mask selects every \p Factor-th element starting
				/// at some offset, i.e. has the form
				///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
				///
				/// Negative mask elements are ignored. On success \p Index holds the start
				/// offset that matched; on failure it is left equal to \p Factor.
				static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
				                                       unsigned &Index) {
				  // Try each candidate start offset in [0, Factor).
				  for (Index = 0; Index < Factor; ++Index) {
				    bool AllMatch = true;

				    // Every non-negative element must sit at Index + Pos * Factor.
				    for (unsigned Pos = 0, E = Mask.size(); Pos != E && AllMatch; ++Pos) {
				      const int Elt = Mask[Pos];
				      if (Elt < 0)
				        continue;
				      AllMatch = static_cast<unsigned>(Elt) == Index + Pos * Factor;
				    }

				    if (AllMatch)
				      return true;
				  }

				rengolinUnsubmitted Not Done Reply Inline Actions If the mask index can't be negative, why use ArrayRef<int>? rengolin: If the mask index can't be negative, why use ArrayRef<int>?
				HaoLiuAuthorUnsubmitted Not Done Reply Inline Actions It could be negative. When a mask is undef, it is -1. Here we only compare non-negative masks and ignore undef masks. HaoLiu: It could be negative. When a mask is undef, it is -1. Here we only compare non-negative masks…
				  return false;
				}

				/// \brief Search for an interleave factor for which \p Mask is a
				/// DE-interleave mask of an interleaved load.
				///
				/// E.g. DE-interleave masks (Factor = 2) could be:
				///     <0, 2, 4, 6> (mask of index 0 to extract even elements)
				///     <1, 3, 5, 7> (mask of index 1 to extract odd elements)
				///
				/// Factors from MIN_FACTOR to MAX_FACTOR are tried in ascending order. On
				/// success \p Factor and \p Index describe the first match.
				static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
				                               unsigned &Index) {
				  // Masks with fewer than two elements are rejected without touching the
				  // output parameters.
				  if (Mask.size() >= 2) {
				    Factor = MIN_FACTOR;
				    while (Factor <= MAX_FACTOR) {
				      if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
				        return true;
				      Factor++;
				    }
				  }

				  return false;
				}

				rengolinUnsubmitted Not Done Reply Inline Actions Checking for all factors "up to" in isDeInterleaveMaskOfFactor() is redundant with this line. Though, I see that you're using it in other functions that may need that functionality. Not sure how to split this, but it looks inefficient... rengolin: Checking for all factors "up to" in isDeInterleaveMaskOfFactor() is redundant with this line.
				HaoLiuAuthorUnsubmitted Not Done Reply Inline Actions I merged isDeInterleaveMask() and isDeInterleaveMaskOfFactor() into one function isDeInterleaveMask(). HaoLiu: I merged isDeInterleaveMask() and isDeInterleaveMaskOfFactor() into one function…
				rengolinUnsubmitted Not Done Reply Inline Actions Hum, I'm still seeing isDeInterleaveMaskOfFactor in the latest patch... rengolin: Hum, I'm still seeing isDeInterleaveMaskOfFactor in the latest patch...
				HaoLiuAuthorUnsubmitted Not Done Reply Inline Actions Sorry. I misunderstood and misleaded. I merged isReInterleaveMask() and isReInterleaveMaskOfFactor(). I cannot merge isDeInterleaveMask() and isDeInterleaveMaskOfFactor(), which are both used in lowerInterleavedLoad(). The former is used to check and find an interleave factor. The later is only used to check whether the given mask is the DE-interleaved of the given factor. HaoLiu: Sorry. I misunderstood and misleaded. I merged isReInterleaveMask() and…
				rengolinUnsubmitted Not Done Reply Inline Actions Right, I thought it was weird that you had merged them. :) rengolin: Right, I thought it was weird that you had merged them. :)
				/// \brief Test whether \p Mask is a RE-interleave mask for the given
				/// \p Factor.
				///
				/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
				///
				/// The mask length must be a multiple of \p Factor and the per-vector
				/// element count must be a power of two. Negative mask elements are ignored.
				static bool isReInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor) {
				  const unsigned NumElts = Mask.size();
				  if (NumElts % Factor != 0)
				    return false;

				  const unsigned NumSubElts = NumElts / Factor;
				  if (!isPowerOf2_32(NumSubElts))
				    return false;

				  // Walk the mask linearly: position Pos holds lane (Pos / Factor) of
				  // sub-vector (Pos % Factor).
				  for (unsigned Pos = 0; Pos != NumElts; ++Pos) {
				    const int Elt = Mask[Pos];
				    if (Elt < 0)
				      continue;

				    const unsigned Lane = Pos / Factor;
				    const unsigned SubVec = Pos % Factor;
				    if (static_cast<unsigned>(Elt) != SubVec * NumSubElts + Lane)
				      return false;
				  }

				  return true;
				}

				/// \brief Search for an interleave factor for which \p Mask is the
				/// RE-interleave mask of an interleaved store.
				///
				/// E.g. The RE-interleave mask (Factor = 2) could be:
				///     <0, 4, 1, 5, 2, 6, 3, 7>
				///
				/// Factors from MIN_FACTOR to MAX_FACTOR are tried in ascending order; on
				/// success \p Factor holds the first one that matched.
				static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
				  // Masks with fewer than four elements are rejected without touching
				  // Factor.
				  if (Mask.size() >= 4) {
				    Factor = MIN_FACTOR;
				    while (Factor <= MAX_FACTOR) {
				      if (isReInterleaveMaskOfFactor(Mask, Factor))
				        return true;
				      Factor++;
				    }
				  }

				  return false;
				}

				bool InterleavedAccess::matchInterleavedLoad(
				LoadInst LI, SmallVector<Instruction , 32> &DeadInsts) {
				if (!LI->isSimple())
				return false;

				SmallVector<ShuffleVectorInst *, 4> Shuffles;
				// Check if all users of this load are shufflevectors.
				for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
				ShuffleVectorInst SVI = dyn_cast<ShuffleVectorInst>(UI);
				if (!SVI \|\| !isa<UndefValue>(SVI->getOperand(1)))
				return false;

				Shuffles.push_back(SVI);
				}

				if (Shuffles.empty())
				return false;

				unsigned Factor, Index;

				// Check if the first shufflevector is DE-interleave shuffle.
				if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index))
				return false;

				// Holds the corresponding index for each DE-interleave shuffle.
				SmallVector<unsigned, 4> Indices;
				Indices.push_back(Index);

				Type *VecTy = Shuffles[0]->getType();

				// Check if other shufflevectors are also DE-interleaved of the same type
				// and factor as the first shufflevector.
				for (unsigned i = 1; i < Shuffles.size(); i++) {
				if (Shuffles[i]->getType() != VecTy)
				return false;

				if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
				Index))
				return false;

				Indices.push_back(Index);
				}

				DEBUG(dbgs() << "CGP: Found an interleaved load: " << *LI << "\n");

				// Try to create target specific intrinsics to replace the load and shuffles.
				if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
				return false;

				DEBUG(dbgs() << "CGP: Matched the interleaved load successfully.\n");
				rengolinUnsubmitted Not Done Reply Inline Actions CGP? rengolin: CGP?
				HaoLiuAuthorUnsubmitted Not Done Reply Inline Actions Fixed. HaoLiu: Fixed.

				for (auto SVI : Shuffles)
				DeadInsts.push_back(SVI);

				DeadInsts.push_back(LI);
				return true;
				}

				bool InterleavedAccess::matchInterleavedStore(
				StoreInst SI, SmallVector<Instruction , 32> &DeadInsts) {
				if (!SI->isSimple())
				return false;

				ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
				if (!SVI \|\| !SVI->hasOneUse())
				return false;

				// Check if the shufflevector is RE-interleave shuffle.
				unsigned Factor;
				if (!isReInterleaveMask(SVI->getShuffleMask(), Factor))
				mzolotukhinUnsubmitted Not Done Reply Inline Actions Will it work for `Factor != 2`? If not, and other factors aren't supported for now, please add an explicit assert and TODO for it. If yes, should we also check the other shuffles? mzolotukhin: Will it work for `Factor != 2`? If not, and other factors aren't supported for now, please add…
				HaoLiuAuthorUnsubmitted Not Done Reply Inline Actions Yes, it will work for other factor. As the previous example of factor 4, we only need to check the last shuffle with RE-interleaved mask. HaoLiu: Yes, it will work for other factor. As the previous example of factor 4, we only need to check…
				return false;

				DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");

				// Try to create target specific intrinsics to replace the store and shuffle.
				if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
				return false;

				DEBUG(dbgs() << "IA: Matched the interleaved store successfully.\n");

				// Already have a new target specific interleaved store. Erase the old store.
				DeadInsts.push_back(SI);
				DeadInsts.push_back(SVI);
				return true;
				}

				/// \brief Scan every instruction of \p F and try to lower interleaved loads
				/// and stores through the target's TargetLowering.
				///
				/// Does nothing and returns false when the pass was created without a
				/// TargetMachine. Instructions replaced during the scan are collected and
				/// erased only after the scan has finished. Returns true if any access was
				/// lowered.
				bool InterleavedAccess::runOnFunction(Function &F) {
				  DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");

				  if (!TM)
				    return false;

				  TLI = TM->getSubtargetImpl(F)->getTargetLowering();

				  // Holds dead instructions that will be erased later.
				  SmallVector<Instruction *, 32> DeadInsts;
				  bool Changed = false;

				  for (auto &I : inst_range(F)) {
				    if (LoadInst *LI = dyn_cast<LoadInst>(&I))
				      Changed |= matchInterleavedLoad(LI, DeadInsts);

				    if (StoreInst *SI = dyn_cast<StoreInst>(&I))
				      Changed |= matchInterleavedStore(SI, DeadInsts);
				  }

				  // Erasing is deferred until here so the instruction walk above never
				  // visits a deleted instruction.
				  for (Instruction *I : DeadInsts)
				    I->eraseFromParent();

				  return Changed;
				}

lib/Target/AArch64/AArch64ISelLowering.h

Show First 20 Lines • Show All 304 Lines • ▼ Show 20 Lines	public:
bool isZExtFree(Type Ty1, Type Ty2) const override;		bool isZExtFree(Type Ty1, Type Ty2) const override;
bool isZExtFree(EVT VT1, EVT VT2) const override;		bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;		bool isZExtFree(SDValue Val, EVT VT2) const override;

bool hasPairedLoad(Type *LoadedType,		bool hasPairedLoad(Type *LoadedType,
unsigned &RequiredAligment) const override;		unsigned &RequiredAligment) const override;
bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;		bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;

		bool lowerInterleavedLoad(LoadInst *LI,
		ArrayRef<ShuffleVectorInst *> Shuffles,
		ArrayRef<unsigned> Indices,
		unsigned Factor) const override;
		bool lowerInterleavedStore(StoreInst SI, ShuffleVectorInst SVI,
		unsigned Factor) const override;

bool isLegalAddImmediate(int64_t) const override;		bool isLegalAddImmediate(int64_t) const override;
bool isLegalICmpImmediate(int64_t) const override;		bool isLegalICmpImmediate(int64_t) const override;

EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,		EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,		bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
MachineFunction &MF) const override;		MachineFunction &MF) const override;

/// isLegalAddressingMode - Return true if the addressing mode represented		/// isLegalAddressingMode - Return true if the addressing mode represented
▲ Show 20 Lines • Show All 207 Lines • Show Last 20 Lines

lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,820 Lines • ▼ Show 20 Lines	if (!LoadedType.isSimple() \|\|
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))		(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;		return false;
// Cyclone supports unaligned accesses.		// Cyclone supports unaligned accesses.
RequiredAligment = 0;		RequiredAligment = 0;
unsigned NumBits = LoadedType.getSizeInBits();		unsigned NumBits = LoadedType.getSizeInBits();
return NumBits == 32 \|\| NumBits == 64;		return NumBits == 32 \|\| NumBits == 64;
}		}

		/// \brief Lower an interleaved load to a ldN intrinsic.
		///
		// E.g. Lower an interleaved load (Factor = 2):
		// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
		// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
		// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
		// Into:
		// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
		// %v0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
		// %v1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
		bool AArch64TargetLowering::lowerInterleavedLoad(
		LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
		ArrayRef<unsigned> Indices, unsigned Factor) const {
		if (Factor < 2 \|\| Factor > 4)
		return false;

		assert(!Shuffles.empty() && "Empty shufflevector input");
		assert(Shuffles.size() == Indices.size() &&
		"Unmatched number of shufflevectors and indices");

		const DataLayout *DL = getDataLayout();

		VectorType *VecTy = Shuffles[0]->getType();
		unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);

		// Skip illegal vector types.
		if (VecSize != 64 && VecSize != 128)
		return false;

		// A pointer vector can not be the return type of the ldN intrinsics. Need to
		// load integer vectors first and then convert to pointer vectors.
		Type *EltTy = VecTy->getVectorElementType();
		if (EltTy->isPointerTy())
		VecTy = VectorType::get(DL->getIntPtrType(EltTy),
		VecTy->getVectorNumElements());

		Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
		Type *Tys[2] = {VecTy, PtrTy};
		static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
		Intrinsic::aarch64_neon_ld3,
		Intrinsic::aarch64_neon_ld4};
		Function *LdNFunc =
		Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

		IRBuilder<> Builder(LI);
		Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);

		CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");

		// Replace uses of each shufflevector with the corresponding vector loaded
		// by ldN.
		for (unsigned i = 0; i < Shuffles.size(); i++) {
		ShuffleVectorInst *SVI = Shuffles[i];
		unsigned Index = Indices[i];

		Value *SubVec = Builder.CreateExtractValue(LdN, Index);

		// Convert the integer vector to pointer vector if the element is pointer.
		if (EltTy->isPointerTy())
		SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());

		SVI->replaceAllUsesWith(SubVec);
		}

		return true;
		}

		/// \brief Build a constant vector of \p NumElts consecutive i32 values
		/// beginning at \p Start.
		///
		/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
		static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
		                                   unsigned NumElts) {
		  SmallVector<Constant *, 16> Elts;
		  for (unsigned Cur = Start, End = Start + NumElts; Cur != End; ++Cur)
		    Elts.push_back(Builder.getInt32(Cur));

		  return ConstantVector::get(Elts);
		}

		/// \brief Lower an interleaved store to a stN intrinsic.
		///
		/// E.g. Lower an interleaved store (Factor = 2):
		///        %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
		///        store <8 x i32> %i.vec, <8 x i32>* %ptr
		///      Into:
		///        %sub.v0 = shuffle %v0, %v1, <0, 1, 2, 3>
		///        %sub.v1 = shuffle %v0, %v1, <4, 5, 6, 7>
		///        call void llvm.aarch64.neon.st2(%sub.v0, %sub.v1, %ptr)
		///
		/// The sub vectors are taken from the two operands of \p SVI, not from its
		/// result. Returns false without changing the IR when \p Factor is outside
		/// [2, 4] or a sub vector is not 64 or 128 bits wide. The original store and
		/// shuffle are not erased here.
		bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
		                                                  ShuffleVectorInst *SVI,
		                                                  unsigned Factor) const {
		  if (Factor < 2 || Factor > 4)
		    return false;

		  VectorType *VecTy = SVI->getType();
		  assert(VecTy->getVectorNumElements() % Factor == 0 &&
		         "Invalid interleave factor");

		  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
		  Type *EltTy = VecTy->getVectorElementType();
		  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);

		  const DataLayout *DL = getDataLayout();
		  unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);

		  // Skip illegal vector types.
		  if (SubVecSize != 64 && SubVecSize != 128)
		    return false;

		  Value *Op0 = SVI->getOperand(0);
		  Value *Op1 = SVI->getOperand(1);
		  // New instructions are inserted in front of the original store.
		  IRBuilder<> Builder(SI);

		  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
		  // vectors to integer vectors.
		  if (EltTy->isPointerTy()) {
		    Type *IntTy = DL->getIntPtrType(EltTy);
		    // The result is dereferenced unconditionally, so use the asserting
		    // cast<> rather than dyn_cast<>.
		    unsigned NumOpElts =
		        cast<VectorType>(Op0->getType())->getVectorNumElements();

		    // Convert to the corresponding integer vector.
		    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
		    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
		    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

		    SubVecTy = VectorType::get(IntTy, NumSubElts);
		  }

		  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
		  Type *Tys[2] = {SubVecTy, PtrTy};
		  // Indexed by Factor - 2; the range check above keeps the index in bounds.
		  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
		                                             Intrinsic::aarch64_neon_st3,
		                                             Intrinsic::aarch64_neon_st4};
		  Function *StNFunc =
		      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

		  // Factor sub vectors followed by the pointer operand.
		  SmallVector<Value *, 5> Ops;

		  // Split the shufflevector operands into sub vectors for the new stN call.
		  for (unsigned i = 0; i < Factor; i++)
		    Ops.push_back(Builder.CreateShuffleVector(
		        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));

		  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
		  Builder.CreateCall(StNFunc, Ops);
		  return true;
		}

static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,		static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
unsigned AlignCheck) {		unsigned AlignCheck) {
return ((SrcAlign == 0 \|\| SrcAlign % AlignCheck == 0) &&		return ((SrcAlign == 0 \|\| SrcAlign % AlignCheck == 0) &&
(DstAlign == 0 \|\| DstAlign % AlignCheck == 0));		(DstAlign == 0 \|\| DstAlign % AlignCheck == 0));
}		}

EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,		EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
unsigned SrcAlign, bool IsMemset,		unsigned SrcAlign, bool IsMemset,
▲ Show 20 Lines • Show All 2,440 Lines • Show Last 20 Lines

lib/Target/AArch64/AArch64TargetMachine.cpp

Show First 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair"
" optimization pass"), cl::init(true), cl::Hidden);		" optimization pass"), cl::init(true), cl::Hidden);

static cl::opt<bool>		static cl::opt<bool>
EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,		EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
cl::desc("Run SimplifyCFG after expanding atomic operations"		cl::desc("Run SimplifyCFG after expanding atomic operations"
" to make use of cmpxchg flow-based information"),		" to make use of cmpxchg flow-based information"),
cl::init(true));		cl::init(true));

		static cl::opt<bool> AArch64InterleavedAccessOpt(
		"aarch64-interleaved-access-opt",
		cl::desc("Optimize interleaved memory accesses in the AArch64 backend"),
		cl::init(false), cl::Hidden);

static cl::opt<bool>		static cl::opt<bool>
EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,		EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
cl::desc("Run early if-conversion"),		cl::desc("Run early if-conversion"),
cl::init(true));		cl::init(true));

static cl::opt<bool>		static cl::opt<bool>
EnableCondOpt("aarch64-condopt",		EnableCondOpt("aarch64-condopt",
cl::desc("Enable the condition optimizer pass"),		cl::desc("Enable the condition optimizer pass"),
▲ Show 20 Lines • Show All 142 Lines • ▼ Show 20 Lines	void AArch64PassConfig::addIRPasses() {
// Cmpxchg instructions are often used with a subsequent comparison to		// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in		// determine whether it succeeded. We can exploit existing control-flow in
// ldrex/strex loops to simplify this, but it needs tidying up.		// ldrex/strex loops to simplify this, but it needs tidying up.
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)		if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
addPass(createCFGSimplificationPass());		addPass(createCFGSimplificationPass());

TargetPassConfig::addIRPasses();		TargetPassConfig::addIRPasses();

		if (TM->getOptLevel() != CodeGenOpt::None && AArch64InterleavedAccessOpt)
		addPass(createInterleavedAccessPass(TM));

if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {		if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices		// Call SeparateConstOffsetFromGEP pass to extract constants within indices
// and lower a GEP with multiple indices to either arithmetic operations or		// and lower a GEP with multiple indices to either arithmetic operations or
// multiple GEPs with single index.		// multiple GEPs with single index.
addPass(createSeparateConstOffsetFromGEPPass(TM, true));		addPass(createSeparateConstOffsetFromGEPPass(TM, true));
// Call EarlyCSE pass to find and remove subexpressions in the lowered		// Call EarlyCSE pass to find and remove subexpressions in the lowered
// result.		// result.
addPass(createEarlyCSEPass());		addPass(createEarlyCSEPass());
▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines

lib/Target/AArch64/AArch64TargetTransformInfo.h

Show First 20 Lines • Show All 133 Lines • ▼ Show 20 Lines	public:

void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);		void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);

Value getOrCreateResultFromMemIntrinsic(IntrinsicInst Inst,		Value getOrCreateResultFromMemIntrinsic(IntrinsicInst Inst,
Type *ExpectedType);		Type *ExpectedType);

bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);		bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);

		unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
		unsigned Factor,
		ArrayRef<unsigned> Indices,
		unsigned Alignment,
		unsigned AddressSpace);
/// @}		/// @}
};		};

} // end namespace llvm		} // end namespace llvm

#endif		#endif

lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Show First 20 Lines • Show All 401 Lines • ▼ Show 20 Lines	if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;		unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
// We generate 2 instructions per vector element.		// We generate 2 instructions per vector element.
return NumVectorizableInstsToAmortize * NumVecElts * 2;		return NumVectorizableInstsToAmortize * NumVecElts * 2;
}		}

return LT.first;		return LT.first;
}		}

		unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
		unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
		unsigned Alignment, unsigned AddressSpace) {
		assert(isa<VectorType>(VecTy) && "Expect a vector type");

		if (Factor > 1 && Factor < 5) {
		mzolotukhinUnsubmitted Not Done Reply Inline Actions Nitpick: I'd prefer comparing with 2 and 4, instead of 1 and 5. I.e. if (Factor >= 2 && Factor <= 4) Also, could we somehow reuse `MIN_FACTOR` and `MAX_FACTOR` from `InterleavedAccessPass.cpp` here? Having the same constants in different places will lead to bugs in future. mzolotukhin: Nitpick: I'd prefer comparing with 2 and 4, instead of 1 and 5. I.e. ``` if (Factor >= 2 &&…
		HaoLiuAuthorUnsubmitted Not Done Reply Inline Actions I refactored to add a hook called getMaxSupportedInterleaveFactor(), which is used to share the maximum factor supported by a target. No need to get the minimum factor, which is always 2. HaoLiu: I refactored to add a hook called getMaxSupportedInterleaveFactor(), which is used to share the…
		unsigned NumElts = VecTy->getVectorNumElements();
		Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
		unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy);

		// ldN/stN only support legal vector types of size 64 or 128 in bits.
		if (NumElts % Factor == 0 && (SubVecSize == 64 \|\| SubVecSize == 128))
		return Factor;
		}

		return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
		Alignment, AddressSpace);
		}

unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {		unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
unsigned Cost = 0;		unsigned Cost = 0;
for (auto *I : Tys) {		for (auto *I : Tys) {
if (!I->isVectorTy())		if (!I->isVectorTy())
continue;		continue;
if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)		if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +		Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
getMemoryOpCost(Instruction::Load, I, 128, 0);		getMemoryOpCost(Instruction::Load, I, 128, 0);
▲ Show 20 Lines • Show All 104 Lines • Show Last 20 Lines

lib/Target/ARM/ARMISelLowering.h

Show First 20 Lines • Show All 427 Lines • ▼ Show 20 Lines	public:
Value emitStoreConditional(IRBuilder<> &Builder, Value Val,		Value emitStoreConditional(IRBuilder<> &Builder, Value Val,
Value *Addr, AtomicOrdering Ord) const override;		Value *Addr, AtomicOrdering Ord) const override;

Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,		Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
bool IsStore, bool IsLoad) const override;		bool IsStore, bool IsLoad) const override;
Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,		Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
bool IsStore, bool IsLoad) const override;		bool IsStore, bool IsLoad) const override;

		/// ARM override of the interleaved-load lowering hook. The definition in
		/// ARMISelLowering.cpp replaces the uses of each shufflevector in
		/// \p Shuffles with a field extracted from a vldN intrinsic call, and
		/// returns false when \p Factor or the vector type is not handled.
		bool lowerInterleavedLoad(LoadInst *LI,
		ArrayRef<ShuffleVectorInst *> Shuffles,
		ArrayRef<unsigned> Indices,
		unsigned Factor) const override;
		bool lowerInterleavedStore(StoreInst SI, ShuffleVectorInst SVI,
		unsigned Factor) const override;

bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;		bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;		bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicRMWExpansionKind		TargetLoweringBase::AtomicRMWExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;		shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;

bool useLoadStackGuardNode() const override;		bool useLoadStackGuardNode() const override;

bool canCombineStoreAndExtract(Type VectorTy, Value Idx,		bool canCombineStoreAndExtract(Type VectorTy, Value Idx,
▲ Show 20 Lines • Show All 200 Lines • Show Last 20 Lines

lib/Target/ARM/ARMISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 11,351 Lines • ▼ Show 20 Lines	Value ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value Val,
Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);		Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

return Builder.CreateCall(		return Builder.CreateCall(
Strex, {Builder.CreateZExtOrBitCast(		Strex, {Builder.CreateZExtOrBitCast(
Val, Strex->getFunctionType()->getParamType(0)),		Val, Strex->getFunctionType()->getParamType(0)),
Addr});		Addr});
}		}

		/// \brief Lower an interleaved load to a vldN intrinsic.
		///
		/// E.g. Lower an interleaved load (Factor = 2):
		/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
		/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
		/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
		/// Into:
		/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr)
		/// %v0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
		/// %v1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
		bool ARMTargetLowering::lowerInterleavedLoad(
		LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
		ArrayRef<unsigned> Indices, unsigned Factor) const {
		if (Factor < 2 \|\| Factor > 4)
		return false;

		assert(!Shuffles.empty() && "Empty shufflevector input");
		assert(Shuffles.size() == Indices.size() &&
		"Unmatched number of shufflevectors and indices");

		const DataLayout *DL = getDataLayout();

		VectorType *VecTy = Shuffles[0]->getType();
		unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
		bool EltIs64Bits = DL->getTypeAllocSizeInBits(VecTy->getScalarType()) == 64;

		// Skip illegal vector types and vector types of i64/f64 element (vldN doesn't
		// support i64/f64 element).
		if ((VecSize != 64 && VecSize != 128) \|\| EltIs64Bits)
		return false;

		// A pointer vector can not be the return type of the ldN intrinsics. Need to
		// load integer vectors first and then convert to pointer vectors.
		Type *EltTy = VecTy->getVectorElementType();
		if (EltTy->isPointerTy())
		VecTy = VectorType::get(DL->getIntPtrType(EltTy),
		VecTy->getVectorNumElements());

		static const Intrinsic::ID LoadInt[3] = {Intrinsic::arm_neon_vld2,
		Intrinsic::arm_neon_vld3,
		Intrinsic::arm_neon_vld4};

		Function *VldnFunc =
		Intrinsic::getDeclaration(LI->getModule(), LoadInt[Factor - 2], VecTy);

		IRBuilder<> Builder(LI);
		SmallVector<Value *, 2> Ops;

		Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
		Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
		Ops.push_back(Builder.getInt32(LI->getAlignment()));

		CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");

		// Replace uses of each shufflevector with the corresponding vector loaded
		// by ldN.
		for (unsigned i = 0; i < Shuffles.size(); i++) {
		ShuffleVectorInst *SV = Shuffles[i];
		unsigned Index = Indices[i];

		Value *SubVec = Builder.CreateExtractValue(VldN, Index);

		// Convert the integer vector to pointer vector if the element is pointer.
		if (EltTy->isPointerTy())
		SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());

		SV->replaceAllUsesWith(SubVec);
		}

		return true;
		}

		/// \brief Build the constant i32 shuffle mask
		/// <Start, Start + 1, ..., Start + NumElts - 1>.
		///
		/// \p Builder is only used to create the i32 constants.
		static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
		                                   unsigned NumElts) {
		  SmallVector<Constant *, 16> Lanes;
		  for (unsigned Lane = Start, End = Start + NumElts; Lane != End; ++Lane)
		    Lanes.push_back(Builder.getInt32(Lane));

		  return ConstantVector::get(Lanes);
		}

		/// \brief Lower an interleaved store to a vstN intrinsic.
		///
		/// E.g. Lower an interleaved store (Factor = 2):
		///        %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
		///        store <8 x i32> %i.vec, <8 x i32>* %ptr
		///      Into:
		///        %sub.v0 = shuffle %v0, %v1, <0, 1, 2, 3>
		///        %sub.v1 = shuffle %v0, %v1, <4, 5, 6, 7>
		///        call void llvm.arm.neon.vst2(%ptr, %sub.v0, %sub.v1, <alignment>)
		///
		/// Note that the sub-vectors are taken from the operands of the interleaving
		/// shufflevector, not from its result.
		///
		/// \param SI     The store of the interleaved vector; the vstN call is
		///               inserted before it.
		/// \param SVI    The shufflevector that produces the interleaved vector. Its
		///               element count must be a multiple of \p Factor.
		/// \param Factor The interleave factor; only 2, 3 and 4 are handled.
		/// \returns true if a vstN call was emitted, false if nothing was changed.
		bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
		                                              ShuffleVectorInst *SVI,
		                                              unsigned Factor) const {
		  if (Factor < 2 || Factor > 4)
		    return false;

		  VectorType *VecTy = SVI->getType();
		  assert(VecTy->getVectorNumElements() % Factor == 0 &&
		         "Invalid interleave factor");

		  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
		  Type *EltTy = VecTy->getVectorElementType();
		  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);

		  const DataLayout *DL = getDataLayout();
		  unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
		  bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;

		  // Skip illegal sub vector types and vector types of i64/f64 element (vstN
		  // doesn't support i64/f64 element).
		  if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits)
		    return false;

		  Value *Op0 = SVI->getOperand(0);
		  Value *Op1 = SVI->getOperand(1);
		  IRBuilder<> Builder(SI);

		  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
		  // vectors to integer vectors.
		  if (EltTy->isPointerTy()) {
		    Type *IntTy = DL->getIntPtrType(EltTy);

		    // Convert to the corresponding integer vector.
		    Type *IntVecTy =
		        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
		    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
		    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

		    SubVecTy = VectorType::get(IntTy, NumSubElts);
		  }

		  // Indexed by Factor - 2; the range check above keeps the index in bounds.
		  static const Intrinsic::ID StoreInt[3] = {Intrinsic::arm_neon_vst2,
		                                            Intrinsic::arm_neon_vst3,
		                                            Intrinsic::arm_neon_vst4};
		  Function *VstNFunc = Intrinsic::getDeclaration(
		      SI->getModule(), StoreInt[Factor - 2], SubVecTy);

		  // Operand order: i8* address, Factor sub-vectors, alignment.
		  SmallVector<Value *, 6> Ops;

		  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
		  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));

		  // Split the shufflevector operands into sub vectors for the new vstN call.
		  for (unsigned i = 0; i < Factor; i++)
		    Ops.push_back(Builder.CreateShuffleVector(
		        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));

		  Ops.push_back(Builder.getInt32(SI->getAlignment()));
		  Builder.CreateCall(VstNFunc, Ops);
		  return true;
		}

enum HABaseType {		enum HABaseType {
HA_UNKNOWN = 0,		HA_UNKNOWN = 0,
HA_FLOAT,		HA_FLOAT,
HA_DOUBLE,		HA_DOUBLE,
HA_VECT64,		HA_VECT64,
HA_VECT128		HA_VECT128
};		};

▲ Show 20 Lines • Show All 68 Lines • Show Last 20 Lines

lib/Target/ARM/ARMTargetMachine.cpp

Show All 32 Lines

static cl::opt<bool>		static cl::opt<bool>
EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden,		EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden,
cl::desc("Run SimplifyCFG after expanding atomic operations"		cl::desc("Run SimplifyCFG after expanding atomic operations"
" to make use of cmpxchg flow-based information"),		" to make use of cmpxchg flow-based information"),
cl::init(true));		cl::init(true));

static cl::opt<bool>		static cl::opt<bool>
		ARMInterleavedAccessOpt("arm-interleaved-access-opt", cl::Hidden,
		sbarangaUnsubmitted Not Done Reply Inline Actions Would it be better to only have one switch in the interleave pass instead of having a separate switch in each backend? The pass could return when executing runOnFunction if the option is not enabled. sbaranga: Would it be better to only have one switch in the interleave pass instead of having a separate…
		HaoLiuAuthorUnsubmitted Not Done Reply Inline Actions Reasonable. HaoLiu: Reasonable.
		cl::desc("Optimize interleaved memory accesses"
		" in the ARM backend"),
		cl::init(false));

		static cl::opt<bool>
EnableARMLoadStoreOpt("arm-load-store-opt", cl::Hidden,		EnableARMLoadStoreOpt("arm-load-store-opt", cl::Hidden,
cl::desc("Enable ARM load/store optimization pass"),		cl::desc("Enable ARM load/store optimization pass"),
cl::init(true));		cl::init(true));

// FIXME: Unify control over GlobalMerge.		// FIXME: Unify control over GlobalMerge.
static cl::opt<cl::boolOrDefault>		static cl::opt<cl::boolOrDefault>
EnableGlobalMerge("arm-global-merge", cl::Hidden,		EnableGlobalMerge("arm-global-merge", cl::Hidden,
cl::desc("Enable the global merge pass"));		cl::desc("Enable the global merge pass"));
▲ Show 20 Lines • Show All 279 Lines • ▼ Show 20 Lines	void ARMPassConfig::addIRPasses() {
// ldrex/strex loops to simplify this, but it needs tidying up.		// ldrex/strex loops to simplify this, but it needs tidying up.
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)		if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
addPass(createCFGSimplificationPass(-1, [this](const Function &F) {		addPass(createCFGSimplificationPass(-1, [this](const Function &F) {
const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);		const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);
return ST.hasAnyDataBarrier() && !ST.isThumb1Only();		return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
}));		}));

TargetPassConfig::addIRPasses();		TargetPassConfig::addIRPasses();

		if (TM->getOptLevel() != CodeGenOpt::None && ARMInterleavedAccessOpt)
		addPass(createInterleavedAccessPass(TM));
}		}

bool ARMPassConfig::addPreISel() {		bool ARMPassConfig::addPreISel() {
if ((TM->getOptLevel() != CodeGenOpt::None &&		if ((TM->getOptLevel() != CodeGenOpt::None &&
EnableGlobalMerge == cl::BOU_UNSET) \|\|		EnableGlobalMerge == cl::BOU_UNSET) \|\|
EnableGlobalMerge == cl::BOU_TRUE) {		EnableGlobalMerge == cl::BOU_TRUE) {
// FIXME: This is using the thumb1 only constant value for		// FIXME: This is using the thumb1 only constant value for
// maximal global offset for merging globals. We may want		// maximal global offset for merging globals. We may want
▲ Show 20 Lines • Show All 71 Lines • Show Last 20 Lines

lib/Target/ARM/ARMTargetTransformInfo.h

Show First 20 Lines • Show All 120 Lines • ▼ Show 20 Lines	unsigned getArithmeticInstrCost(
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,		TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,		TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,		TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);		TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);

unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,		unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);		unsigned AddressSpace);

		/// Cost of an interleaved memory access group with interleave factor
		/// \p Factor on the wide vector type \p VecTy. The definition in
		/// ARMTargetTransformInfo.cpp returns \p Factor for groups it considers
		/// vldN/vstN candidates and otherwise defers to the base implementation.
		unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
		unsigned Factor,
		ArrayRef<unsigned> Indices,
		unsigned Alignment,
		unsigned AddressSpace);
/// @}		/// @}
};		};

} // end namespace llvm		} // end namespace llvm

#endif		#endif

lib/Target/ARM/ARMTargetTransformInfo.cpp

Show First 20 Lines • Show All 472 Lines • ▼ Show 20 Lines	unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
if (Src->isVectorTy() && Alignment != 16 &&		if (Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isDoubleTy()) {		Src->getVectorElementType()->isDoubleTy()) {
// Unaligned loads/stores are extremely inefficient.		// Unaligned loads/stores are extremely inefficient.
// We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.		// We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
return LT.first * 4;		return LT.first * 4;
}		}
return LT.first;		return LT.first;
}		}

		/// Estimate the cost of an interleaved load/store group on ARM.
		///
		/// When \p Factor is 2, 3 or 4, the element type is not 64 bits wide, the
		/// element count of \p VecTy is a multiple of \p Factor, and each resulting
		/// sub-vector is exactly 64 or 128 bits wide, the group is treated as a
		/// vldN/vstN candidate and \p Factor is returned. Every other case is
		/// delegated to BaseT::getInterleavedMemoryOpCost.
		unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
		                                                unsigned Factor,
		                                                ArrayRef<unsigned> Indices,
		                                                unsigned Alignment,
		                                                unsigned AddressSpace) {
		  assert(isa<VectorType>(VecTy) && "Expect a vector type");

		  // vldN/vstN doesn't support vector types of i64/f64 element.
		  bool EltIs64Bits = DL->getTypeAllocSizeInBits(VecTy->getScalarType()) == 64;

		  if (Factor >= 2 && Factor <= 4 && !EltIs64Bits) {
		    unsigned NumElts = VecTy->getVectorNumElements();

		    // Only build the sub-vector type once the element count is known to
		    // split evenly; NumElts / Factor would be zero when NumElts < Factor.
		    if (NumElts % Factor == 0) {
		      Type *SubVecTy =
		          VectorType::get(VecTy->getScalarType(), NumElts / Factor);
		      // The legality test below is expressed in bits, so the size must be
		      // queried in bits rather than in bytes.
		      unsigned SubVecSize =
		          TLI->getDataLayout()->getTypeAllocSizeInBits(SubVecTy);

		      // vldN/vstN only support legal vector types of size 64 or 128 in bits.
		      if (SubVecSize == 64 || SubVecSize == 128)
		        return Factor;
		    }
		  }

		  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
		                                           Alignment, AddressSpace);
		}

test/CodeGen/AArch64/aarch64-interleaved-accesses.ll

This file was added.

				; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic -aarch64-interleaved-access-opt=true < %s \| FileCheck %s

				; CHECK-LABEL: load_factor2:
				; CHECK: ld2 { v0.8b, v1.8b }, [x0]
				define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
				%wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
				%strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
				%strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
				%add = add nsw <8 x i8> %strided.v0, %strided.v1
				ret <8 x i8> %add
				}

				; CHECK-LABEL: load_factor3:
				; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
				define <4 x i32> @load_factor3(i32* %ptr) {
				  %base = bitcast i32* %ptr to <12 x i32>*
				mzolotukhinUnsubmitted Not Done Reply Inline Actions s/delat/delta/ mzolotukhin: s/delat/delta/
				  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
				  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
				  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
				  %add = add nsw <4 x i32> %strided.v2, %strided.v1
				  ret <4 x i32> %add
				}

				; CHECK-LABEL: load_factor4:
				; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
				define <4 x i32> @load_factor4(i32* %ptr) {
				%base = bitcast i32* %ptr to <16 x i32>*
				%wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
				%strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
				%strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
				%add = add nsw <4 x i32> %strided.v0, %strided.v2
				ret <4 x i32> %add
				}

				; CHECK-LABEL: store_factor2:
				; CHECK: st2 { v0.8b, v1.8b }, [x0]
				define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
				%interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
				store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
				ret void
				}

				; CHECK-LABEL: store_factor3:
				; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
				define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
				%base = bitcast i32* %ptr to <12 x i32>*
				%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
				%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
				store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
				ret void
				}

				; CHECK-LABEL: store_factor4:
				; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
				define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
				%base = bitcast i32* %ptr to <16 x i32>*
				%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
				store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
				ret void
				}

				; The following cases test that interleaved access of pointer vectors can be
				; matched to ldN/stN instruction.

				; CHECK-LABEL: load_ptrvec_factor2:
				; CHECK: ld2 { v0.2d, v1.2d }, [x0]
				define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
				  %base = bitcast i32** %ptr to <4 x i32*>*
				  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
				  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
				  ret <2 x i32*> %strided.v0
				}

				; CHECK-LABEL: load_ptrvec_factor3:
				; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
				define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
				  %base = bitcast i32** %ptr to <6 x i32*>*
				  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
				  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
				  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
				  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
				  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
				  ret void
				}

				; CHECK-LABEL: load_ptrvec_factor4:
				; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
				define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
				  %base = bitcast i32** %ptr to <8 x i32*>*
				  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
				  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
				  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
				  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
				  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
				  ret void
				}

				; CHECK-LABEL: store_ptrvec_factor2:
				; CHECK: st2 { v0.2d, v1.2d }, [x0]
				define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
				  %base = bitcast i32** %ptr to <4 x i32*>*
				  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
				  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
				  ret void
				}

				; CHECK-LABEL: store_ptrvec_factor3:
				; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0]
				define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
				  %base = bitcast i32** %ptr to <6 x i32*>*
				  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
				  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
				  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
				  ret void
				}

				; CHECK-LABEL: store_ptrvec_factor4:
				; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
				define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
				  %base = bitcast i32* %ptr to <8 x i32*>*
				  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
				  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
				  ret void
				}

				; The following cases check that shuffle masks with undef indices can be
				; matched to ldN/stN instructions.

				; CHECK-LABEL: load_undef_mask_factor2:
				; CHECK: ld2 { v0.4s, v1.4s }, [x0]
				define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
				%base = bitcast i32* %ptr to <8 x i32>*
				%wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
				%strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
				%strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
				%add = add nsw <4 x i32> %strided.v0, %strided.v1
				ret <4 x i32> %add
				}

				; CHECK-LABEL: load_undef_mask_factor3:
				; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
				define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
				%base = bitcast i32* %ptr to <12 x i32>*
				%wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
				%strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
				%strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
				%add = add nsw <4 x i32> %strided.v2, %strided.v1
				ret <4 x i32> %add
				}

				; CHECK-LABEL: load_undef_mask_factor4:
				; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
				define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
				%base = bitcast i32* %ptr to <16 x i32>*
				%wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
				%strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
				%strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
				%add = add nsw <4 x i32> %strided.v0, %strided.v2
				ret <4 x i32> %add
				}

				; CHECK-LABEL: store_undef_mask_factor2:
				; CHECK: st2 { v0.4s, v1.4s }, [x0]
				define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
				%base = bitcast i32* %ptr to <8 x i32>*
				%interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
				store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
				ret void
				}

				; CHECK-LABEL: store_undef_mask_factor3:
				; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
				define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
				%base = bitcast i32* %ptr to <12 x i32>*
				%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
				%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
				store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
				ret void
				}

				; CHECK-LABEL: store_undef_mask_factor4:
				; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
				define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
				%base = bitcast i32* %ptr to <16 x i32>*
				%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
				store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
				ret void
				}

test/CodeGen/ARM/arm-interleaved-accesses.ll

This file was added.

				; RUN: llc -mtriple=arm-eabi -mattr=+neon -arm-interleaved-access-opt=true < %s \| FileCheck %s

				; CHECK-LABEL: load_factor2:
				; CHECK: vld2.8 {d16, d17}, [r0]
				define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
				%wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
				%strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
				%strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
				%add = add nsw <8 x i8> %strided.v0, %strided.v1
				ret <8 x i8> %add
				}

				; CHECK-LABEL: load_factor3:
				; CHECK: vld3.32 {d16, d17, d18}, [r0]
				define <2 x i32> @load_factor3(i32* %ptr) {
				  %base = bitcast i32* %ptr to <6 x i32>*
				mzolotukhinUnsubmitted Not Done Reply Inline Actions s/delat/delta/ ? mzolotukhin: s/delat/delta/ ?
				  %wide.vec = load <6 x i32>, <6 x i32>* %base, align 4
				  %strided.v2 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
				  %strided.v1 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
				  %add = add nsw <2 x i32> %strided.v2, %strided.v1
				  ret <2 x i32> %add
				}

				; CHECK-LABEL: load_factor4:
				; CHECK: vld4.32 {d16, d18, d20, d22}, [r0]!
				; CHECK: vld4.32 {d17, d19, d21, d23}, [r0]
				define <4 x i32> @load_factor4(i32* %ptr) {
				%base = bitcast i32* %ptr to <16 x i32>*
				%wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
				%strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
				%strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
				%add = add nsw <4 x i32> %strided.v0, %strided.v2
				ret <4 x i32> %add
				}

				; CHECK-LABEL: store_factor2:
				; CHECK: vst2.8 {d16, d17}, [r0]
				define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
				%interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
				store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
				ret void
				}

				; CHECK-LABEL: store_factor3:
				; CHECK: vst3.32 {d16, d18, d20}, [r0]!
				; CHECK: vst3.32 {d17, d19, d21}, [r0]
				define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
				%base = bitcast i32* %ptr to <12 x i32>*
				%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
				%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
				store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
				ret void
				}

				; CHECK-LABEL: store_factor4:
				; CHECK: vst4.32 {d16, d18, d20, d22}, [r0]!
				; CHECK: vst4.32 {d17, d19, d21, d23}, [r0]
				define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
				%base = bitcast i32* %ptr to <16 x i32>*
				%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
				store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
				ret void
				}

				; The following cases test that interleaved accesses of pointer vectors can be
				; matched to vldN/vstN instructions.

				; CHECK-LABEL: load_ptrvec_factor2:
				; CHECK: vld2.32 {d16, d17}, [r0]
				define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
				  %base = bitcast i32** %ptr to <4 x i32*>*
				  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
				  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
				  ret <2 x i32*> %strided.v0
				}

				; CHECK-LABEL: load_ptrvec_factor3:
				; CHECK: vld3.32 {d16, d17, d18}, [r0]
				define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
				  %base = bitcast i32** %ptr to <6 x i32*>*
				  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
				  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
				  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
				  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
				  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
				  ret void
				}

				; CHECK-LABEL: load_ptrvec_factor4:
				; CHECK: vld4.32 {d16, d17, d18, d19}, [r0]
				define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
				  %base = bitcast i32** %ptr to <8 x i32*>*
				  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
				  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
				  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
				  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
				  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
				  ret void
				}

				; CHECK-LABEL: store_ptrvec_factor2:
				; CHECK: vst2.32 {d16, d17}, [r0]
				define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
				  %base = bitcast i32** %ptr to <4 x i32*>*
				  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
				  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
				  ret void
				}

				; CHECK-LABEL: store_ptrvec_factor3:
				; CHECK: vst3.32 {d16, d17, d18}, [r0]
				define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
				  %base = bitcast i32** %ptr to <6 x i32*>*
				  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
				  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
				  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
				  ret void
				}

				; Factor-4 interleaved store of pointer vectors: %v0/%v1 and %v2/%v3 are
				; concatenated pairwise, then interleaved with stride 4 into an <8 x i32*>
				; that is stored to %ptr.
				; The bitcast target is a pointer to <8 x i32*>, so it is a valid
				; pointer-to-pointer cast from the `i32* %ptr` argument.
				; CHECK-LABEL: store_ptrvec_factor4:
				; CHECK: vst4.32 {d16, d17, d18, d19}, [r0]
				define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
				%base = bitcast i32* %ptr to <8 x i32*>*
				%v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				%v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				%interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
				store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
				ret void
				}

				; The following cases check that shuffle masks with undef indices can still
				; be matched into vldN/vstN instructions.

				; Factor-2 interleaved load with partially undef shuffle masks: an <8 x i32>
				; is loaded from %ptr, two <4 x i32> shuffles select lanes
				; <undef, 2, undef, 6> and <undef, 3, undef, 7>, and their sum is returned.
				; The expected output is a single vld2.32 over four D registers.
				; CHECK-LABEL: load_undef_mask_factor2:
				; CHECK: vld2.32 {d16, d17, d18, d19}, [r0]
				define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
				%base = bitcast i32* %ptr to <8 x i32>*
				%wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
				%strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
				%strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
				%add = add nsw <4 x i32> %strided.v0, %strided.v1
				ret <4 x i32> %add
				}

				; Factor-3 interleaved load with partially undef shuffle masks: a <12 x i32>
				; is loaded from %ptr; one shuffle defines only its first lane
				; (<2, undef, undef, undef>), the other is the fully defined stride-3 mask
				; <1, 4, 7, 10>. Their sum is returned.
				; The expected output is two vld3.32 instructions, the first using the
				; `[r0]!` addressing form.
				; CHECK-LABEL: load_undef_mask_factor3:
				; CHECK: vld3.32 {d16, d18, d20}, [r0]!
				; CHECK: vld3.32 {d17, d19, d21}, [r0]
				define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
				%base = bitcast i32* %ptr to <12 x i32>*
				%wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
				%strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
				%strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
				%add = add nsw <4 x i32> %strided.v2, %strided.v1
				ret <4 x i32> %add
				}

				; Factor-4 interleaved load with partially undef shuffle masks: a <16 x i32>
				; is loaded from %ptr; both shuffles define only their first two lanes
				; (<0, 4, undef, undef> and <2, 6, undef, undef>). Their sum is returned.
				; The expected output is two vld4.32 instructions, the first using the
				; `[r0]!` addressing form.
				; CHECK-LABEL: load_undef_mask_factor4:
				; CHECK: vld4.32 {d16, d18, d20, d22}, [r0]!
				; CHECK: vld4.32 {d17, d19, d21, d23}, [r0]
				define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
				%base = bitcast i32* %ptr to <16 x i32>*
				%wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
				%strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
				%strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
				%add = add nsw <4 x i32> %strided.v0, %strided.v2
				ret <4 x i32> %add
				}

				; Factor-2 interleaved store with a partially undef shuffle mask: %v0 and
				; %v1 are interleaved into an <8 x i32> whose first four lanes are undef and
				; whose last four lanes are <2, 6, 3, 7>; the result is stored to %ptr.
				; The expected output is a single vst2.32 over four D registers.
				; CHECK-LABEL: store_undef_mask_factor2:
				; CHECK: vst2.32 {d16, d17, d18, d19}, [r0]
				define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
				%base = bitcast i32* %ptr to <8 x i32>*
				%interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
				store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
				ret void
				}

				; Factor-3 interleaved store with a partially undef shuffle mask: %v0 and
				; %v1 are concatenated into %v0_v1, %v2 is widened to eight lanes with undef
				; padding, and the final 12-lane interleave mask leaves lanes 2 and 4 undef.
				; The result is stored to %ptr.
				; The expected output is two vst3.32 instructions, the first using the
				; `[r0]!` addressing form.
				; CHECK-LABEL: store_undef_mask_factor3:
				; CHECK: vst3.32 {d16, d18, d20}, [r0]!
				; CHECK: vst3.32 {d17, d19, d21}, [r0]
				define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
				%base = bitcast i32* %ptr to <12 x i32>*
				%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
				%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
				store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
				ret void
				}

				; Factor-4 interleaved store with a partially undef shuffle mask: %v0/%v1
				; and %v2/%v3 are concatenated pairwise into two <8 x i32> values, and the
				; final 16-lane interleave mask leaves lanes 3 and 4 undef. The result is
				; stored to %ptr.
				; The expected output is two vst4.32 instructions, the first using the
				; `[r0]!` addressing form.
				; CHECK-LABEL: store_undef_mask_factor4:
				; CHECK: vst4.32 {d16, d18, d20, d22}, [r0]!
				; CHECK: vst4.32 {d17, d19, d21, d23}, [r0]
				define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
				%base = bitcast i32* %ptr to <16 x i32>*
				%v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
				store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][ARM] Match interleaved memory accesses into ldN/stN/vldN/vstN intrinsics.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 27923

include/llvm/CodeGen/Passes.h

include/llvm/Target/TargetLowering.h

lib/CodeGen/CMakeLists.txt

lib/CodeGen/InterleavedAccessPass.cpp

lib/Target/AArch64/AArch64ISelLowering.h

lib/Target/AArch64/AArch64ISelLowering.cpp

lib/Target/AArch64/AArch64TargetMachine.cpp

lib/Target/AArch64/AArch64TargetTransformInfo.h

lib/Target/AArch64/AArch64TargetTransformInfo.cpp

lib/Target/ARM/ARMISelLowering.h

lib/Target/ARM/ARMISelLowering.cpp

lib/Target/ARM/ARMTargetMachine.cpp

lib/Target/ARM/ARMTargetTransformInfo.h

lib/Target/ARM/ARMTargetTransformInfo.cpp

test/CodeGen/AArch64/aarch64-interleaved-accesses.ll

test/CodeGen/ARM/arm-interleaved-accesses.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][ARM] Match interleaved memory accesses into ldN/stN/vldN/vstN intrinsics.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 27923

include/llvm/CodeGen/Passes.h

include/llvm/Target/TargetLowering.h

lib/CodeGen/CMakeLists.txt

lib/CodeGen/InterleavedAccessPass.cpp

lib/Target/AArch64/AArch64ISelLowering.h

lib/Target/AArch64/AArch64ISelLowering.cpp

lib/Target/AArch64/AArch64TargetMachine.cpp

lib/Target/AArch64/AArch64TargetTransformInfo.h

lib/Target/AArch64/AArch64TargetTransformInfo.cpp

lib/Target/ARM/ARMISelLowering.h

lib/Target/ARM/ARMISelLowering.cpp

lib/Target/ARM/ARMTargetMachine.cpp

lib/Target/ARM/ARMTargetTransformInfo.h

lib/Target/ARM/ARMTargetTransformInfo.cpp

test/CodeGen/AArch64/aarch64-interleaved-accesses.ll

test/CodeGen/ARM/arm-interleaved-accesses.ll

[AArch64][ARM] Match interleaved memory accesses into ldN/stN/vldN/vstN intrinsics.
ClosedPublic