Diff 169270

include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 939 Lines • ▼ Show 20 Lines	public:

/// \returns True if the load instruction is legal to vectorize.		/// \returns True if the load instruction is legal to vectorize.
bool isLegalToVectorizeLoad(LoadInst *LI) const;		bool isLegalToVectorizeLoad(LoadInst *LI) const;

/// \returns True if the store instruction is legal to vectorize.		/// \returns True if the store instruction is legal to vectorize.
bool isLegalToVectorizeStore(StoreInst *SI) const;		bool isLegalToVectorizeStore(StoreInst *SI) const;

/// \returns True if it is legal to vectorize the given load chain.		/// \returns True if it is legal to vectorize the given load chain.
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const;		const DataLayout &DL) const;

/// \returns True if it is legal to vectorize the given store chain.		/// \returns True if it is legal to vectorize the given store chain.
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const;		const DataLayout &DL) const;

/// \returns The new vector factor value if the target doesn't support \p		/// \returns The new vector factor value if the target doesn't support \p
/// SizeInBytes loads or has a better vector factor.		/// SizeInBytes loads or has a better vector factor.
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,		unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const;		VectorType *VecTy) const;

/// \returns The new vector factor value if the target doesn't support \p		/// \returns The new vector factor value if the target doesn't support \p
▲ Show 20 Lines • Show All 197 Lines • ▼ Show 20 Lines	virtual void getMemcpyLoopResidualLoweringType(
unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const = 0;		unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const = 0;
virtual bool areInlineCompatible(const Function *Caller,		virtual bool areInlineCompatible(const Function *Caller,
const Function *Callee) const = 0;		const Function *Callee) const = 0;
virtual bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const = 0;		virtual bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const = 0;
virtual bool isIndexedStoreLegal(MemIndexedMode Mode,Type *Ty) const = 0;		virtual bool isIndexedStoreLegal(MemIndexedMode Mode,Type *Ty) const = 0;
virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;		virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0;		virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0;
virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0;		virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0;
virtual bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,		virtual bool isLegalToVectorizeLoadChain(Value *BasePtr,
		unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const = 0;		const DataLayout &DL) const = 0;
virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		virtual bool isLegalToVectorizeStoreChain(Value *BasePtr,
		unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const = 0;		const DataLayout &DL) const = 0;
virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,		virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const = 0;		VectorType *VecTy) const = 0;
virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,		virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const = 0;		VectorType *VecTy) const = 0;
virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,		virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;		ReductionFlags) const = 0;
▲ Show 20 Lines • Show All 361 Lines • ▼ Show 20 Lines	unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override {
return Impl.getLoadStoreVecRegBitWidth(AddrSpace);		return Impl.getLoadStoreVecRegBitWidth(AddrSpace);
}		}
bool isLegalToVectorizeLoad(LoadInst *LI) const override {		bool isLegalToVectorizeLoad(LoadInst *LI) const override {
return Impl.isLegalToVectorizeLoad(LI);		return Impl.isLegalToVectorizeLoad(LI);
}		}
bool isLegalToVectorizeStore(StoreInst *SI) const override {		bool isLegalToVectorizeStore(StoreInst *SI) const override {
return Impl.isLegalToVectorizeStore(SI);		return Impl.isLegalToVectorizeStore(SI);
}		}
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const override {		const DataLayout &DL) const override {
return Impl.isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,		return Impl.isLegalToVectorizeLoadChain(BasePtr, ChainSizeInBytes,
AddrSpace);		Alignment, DL);
}		}
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const override {		const DataLayout &DL) const override {
return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,		return Impl.isLegalToVectorizeStoreChain(BasePtr, ChainSizeInBytes,
AddrSpace);		Alignment, DL);
}		}
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,		unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const override {		VectorType *VecTy) const override {
return Impl.getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy);		return Impl.getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy);
}		}
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,		unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
▲ Show 20 Lines • Show All 115 Lines • Show Last 20 Lines

include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 532 Lines • ▼ Show 20 Lines	public:
}		}

unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { return 128; }		unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { return 128; }

bool isLegalToVectorizeLoad(LoadInst *LI) const { return true; }		bool isLegalToVectorizeLoad(LoadInst *LI) const { return true; }

bool isLegalToVectorizeStore(StoreInst *SI) const { return true; }		bool isLegalToVectorizeStore(StoreInst *SI) const { return true; }

bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
return true;		return true;
}		}

bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
return true;		return true;
}		}

unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,		unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const {		VectorType *VecTy) const {
return VF;		return VF;
}		}
▲ Show 20 Lines • Show All 298 Lines • Show Last 20 Lines

lib/Analysis/TargetTransformInfo.cpp

Show First 20 Lines • Show All 629 Lines • ▼ Show 20 Lines	bool TargetTransformInfo::isLegalToVectorizeLoad(LoadInst *LI) const {
return TTIImpl->isLegalToVectorizeLoad(LI);		return TTIImpl->isLegalToVectorizeLoad(LI);
}		}

bool TargetTransformInfo::isLegalToVectorizeStore(StoreInst *SI) const {		bool TargetTransformInfo::isLegalToVectorizeStore(StoreInst *SI) const {
return TTIImpl->isLegalToVectorizeStore(SI);		return TTIImpl->isLegalToVectorizeStore(SI);
}		}

bool TargetTransformInfo::isLegalToVectorizeLoadChain(		bool TargetTransformInfo::isLegalToVectorizeLoadChain(
unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const {		Value *BasePtr, unsigned ChainSizeInBytes, unsigned Alignment,
return TTIImpl->isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,		const DataLayout &DL) const {
AddrSpace);		return TTIImpl->isLegalToVectorizeLoadChain(BasePtr, ChainSizeInBytes,
		Alignment, DL);
}		}

bool TargetTransformInfo::isLegalToVectorizeStoreChain(		bool TargetTransformInfo::isLegalToVectorizeStoreChain(
unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const {		Value *BasePtr, unsigned ChainSizeInBytes, unsigned Alignment,
return TTIImpl->isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,		const DataLayout &DL) const {
AddrSpace);		return TTIImpl->isLegalToVectorizeStoreChain(BasePtr, ChainSizeInBytes,
		Alignment, DL);
}		}

unsigned TargetTransformInfo::getLoadVectorFactor(unsigned VF,		unsigned TargetTransformInfo::getLoadVectorFactor(unsigned VF,
unsigned LoadSize,		unsigned LoadSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const {		VectorType *VecTy) const {
return TTIImpl->getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy);		return TTIImpl->getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy);
}		}
▲ Show 20 Lines • Show All 541 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Show First 20 Lines • Show All 139 Lines • ▼ Show 20 Lines	public:
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,		unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const;		VectorType *VecTy) const;
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,		unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const;		VectorType *VecTy) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;		unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeMemChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const;		const DataLayout &DL) const;
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const;		const DataLayout &DL) const;
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const;		const DataLayout &DL) const;

unsigned getMaxInterleaveFactor(unsigned VF);		unsigned getMaxInterleaveFactor(unsigned VF);

bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;		bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

int getArithmeticInstrCost(		int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,		unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,		TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	public:

void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,		void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);		TTI::UnrollingPreferences &UP);
unsigned getHardwareNumberOfRegisters(bool Vec) const;		unsigned getHardwareNumberOfRegisters(bool Vec) const;
unsigned getNumberOfRegisters(bool Vec) const;		unsigned getNumberOfRegisters(bool Vec) const;
unsigned getRegisterBitWidth(bool Vector) const;		unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;		unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;		unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,		bool isLegalToVectorizeMemChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned AddrSpace) const;
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const;		const DataLayout &DL) const;
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const;		const DataLayout &DL) const;
		bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
		unsigned Alignment,
		const DataLayout &DL) const;
unsigned getMaxInterleaveFactor(unsigned VF);		unsigned getMaxInterleaveFactor(unsigned VF);
unsigned getCFInstrCost(unsigned Opcode);		unsigned getCFInstrCost(unsigned Opcode);
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);		int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};		};

} // end namespace llvm		} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H		#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Show First 20 Lines • Show All 263 Lines • ▼ Show 20 Lines	if (AddrSpace == AMDGPUAS::FLAT_ADDRESS \|\|
return 128;		return 128;

if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)		if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();		return 8 * ST->getMaxPrivateElementSize();

llvm_unreachable("unhandled address space");		llvm_unreachable("unhandled address space");
}		}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,		bool GCNTTIImpl::isLegalToVectorizeMemChain(Value *BasePtr,
		unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
		auto Ty = cast<PointerType>(BasePtr->getType());
		unsigned AS = Ty->getAddressSpace();

// We allow vectorization of flat stores, even though we may need to decompose		// We allow vectorization of flat stores, even though we may need to decompose
// them later if they may access private memory. We don't have enough context		// them later if they may access private memory. We don't have enough context
// here, and legalization can handle it.		// here, and legalization can handle it.
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {		if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
return (Alignment >= 4 \|\| ST->hasUnalignedScratchAccess()) &&		return (Alignment >= 4 \|\| ST->hasUnalignedScratchAccess()) &&
ChainSizeInBytes <= ST->getMaxPrivateElementSize();		ChainSizeInBytes <= ST->getMaxPrivateElementSize();
}		}

		// SI has a hardware bug in the LDS / GDS bounds checking: if the base address
		// is negative, then the instruction is incorrectly treated as out-of-bounds
		// even if base + offset is in bounds. This affects the high words of a
		// vectorized load / store.
		if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
		(AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS)) {
		if (ChainSizeInBytes <= Alignment)
return true;		return true;

		KnownBits Bits = computeKnownBits(BasePtr, DL);
		arsenm (inline review comment): This seems like an expensive check for this. Is this so important?
		arsenm (inline review comment): I mean I don't understand why this would really matter that much. If we ignore this problem and let it vectorize, the resulting code shouldn't be that different when selection fixes it. The advantage is just making the IR closer to the final hardware instructions, which has minor cost analysis benefits?
		return Bits.isNonNegative();
}		}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,		return true;
		}

		bool GCNTTIImpl::isLegalToVectorizeLoadChain(Value *BasePtr,
		unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);		return isLegalToVectorizeMemChain(BasePtr, ChainSizeInBytes, Alignment, DL);
}		}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		bool GCNTTIImpl::isLegalToVectorizeStoreChain(Value *BasePtr,
		unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);		return isLegalToVectorizeMemChain(BasePtr, ChainSizeInBytes, Alignment, DL);
}		}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {		unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.		// Disable unrolling if the loop is not vectorized.
// TODO: Enable this again.		// TODO: Enable this again.
if (VF == 1)		if (VF == 1)
return 1;		return 1;

▲ Show 20 Lines • Show All 350 Lines • ▼ Show 20 Lines	unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS \|\|		if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS \|\|
AddrSpace == AMDGPUAS::PARAM_I_ADDRESS \|\|		AddrSpace == AMDGPUAS::PARAM_I_ADDRESS \|\|
(AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&		(AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))		AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
return 128;		return 128;
llvm_unreachable("unhandled address space");		llvm_unreachable("unhandled address space");
}		}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,		bool R600TTIImpl::isLegalToVectorizeMemChain(Value *BasePtr,
		unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
// We allow vectorization of flat stores, even though we may need to decompose		// We allow vectorization of flat stores, even though we may need to decompose
// them later if they may access private memory. We don't have enough context		// them later if they may access private memory. We don't have enough context
// here, and legalization can handle it.		// here, and legalization can handle it.
return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);		unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
		return AS != AMDGPUAS::PRIVATE_ADDRESS;
}		}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,		bool R600TTIImpl::isLegalToVectorizeLoadChain(Value *BasePtr,
		unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);		return isLegalToVectorizeMemChain(BasePtr, ChainSizeInBytes, Alignment, DL);
}		}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		bool R600TTIImpl::isLegalToVectorizeStoreChain(Value *BasePtr,
		unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);		return isLegalToVectorizeMemChain(BasePtr, ChainSizeInBytes, Alignment, DL);
}		}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {		unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.		// Disable unrolling if the loop is not vectorized.
// TODO: Enable this again.		// TODO: Enable this again.
if (VF == 1)		if (VF == 1)
return 1;		return 1;

▲ Show 20 Lines • Show All 41 Lines • Show Last 20 Lines

lib/Target/NVPTX/NVPTXTargetTransformInfo.h

Show First 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	public:
bool isSourceOfDivergence(const Value *V);		bool isSourceOfDivergence(const Value *V);

unsigned getFlatAddressSpace() const {		unsigned getFlatAddressSpace() const {
return AddressSpace::ADDRESS_SPACE_GENERIC;		return AddressSpace::ADDRESS_SPACE_GENERIC;
}		}

// Loads and stores can be vectorized if the alignment is at least as big as		// Loads and stores can be vectorized if the alignment is at least as big as
// the load/store we want to vectorize.		// the load/store we want to vectorize.
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
return Alignment >= ChainSizeInBytes;		return Alignment >= ChainSizeInBytes;
}		}
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,		bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
unsigned Alignment,		unsigned Alignment,
unsigned AddrSpace) const {		const DataLayout &DL) const {
return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);		return isLegalToVectorizeLoadChain(BasePtr, ChainSizeInBytes, Alignment,
		DL);
}		}

// NVPTX has infinite registers of all kinds, but the actual machine doesn't.		// NVPTX has infinite registers of all kinds, but the actual machine doesn't.
// We conservatively return 1 here which is just enough to enable the		// We conservatively return 1 here which is just enough to enable the
// vectorizers but disables heuristics based on the number of registers.		// vectorizers but disables heuristics based on the number of registers.
// FIXME: Return a more reasonable number, while keeping an eye on		// FIXME: Return a more reasonable number, while keeping an eye on
// LoopVectorizer's unrolling heuristics.		// LoopVectorizer's unrolling heuristics.
unsigned getNumberOfRegisters(bool Vector) const { return 1; }		unsigned getNumberOfRegisters(bool Vector) const { return 1; }
▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Show First 20 Lines • Show All 993 Lines • ▼ Show 20 Lines	if (accessIsMisaligned(SzInBytes, AS, Alignment)) {

unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),		unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
StackAdjustedAlignment,		StackAdjustedAlignment,
DL, S0, nullptr, &DT);		DL, S0, nullptr, &DT);
if (NewAlign != 0)		if (NewAlign != 0)
Alignment = NewAlign;		Alignment = NewAlign;
}		}

if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {		if (!TTI.isLegalToVectorizeStoreChain(S0->getPointerOperand(), SzInBytes,
		Alignment, DL)) {
auto Chains = splitOddVectorElts(Chain, Sz);		auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeStoreChain(Chains.first, InstructionsProcessed) \|		return vectorizeStoreChain(Chains.first, InstructionsProcessed) \|
vectorizeStoreChain(Chains.second, InstructionsProcessed);		vectorizeStoreChain(Chains.second, InstructionsProcessed);
}		}

BasicBlock::iterator First, Last;		BasicBlock::iterator First, Last;
std::tie(First, Last) = getBoundaryInstrs(Chain);		std::tie(First, Last) = getBoundaryInstrs(Chain);
Builder.SetInsertPoint(&*Last);		Builder.SetInsertPoint(&*Last);
▲ Show 20 Lines • Show All 127 Lines • ▼ Show 20 Lines	unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
StackAdjustedAlignment,		StackAdjustedAlignment,
DL, L0, nullptr, &DT);		DL, L0, nullptr, &DT);
if (NewAlign != 0)		if (NewAlign != 0)
Alignment = NewAlign;		Alignment = NewAlign;

Alignment = NewAlign;		Alignment = NewAlign;
}		}

if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {		if (!TTI.isLegalToVectorizeLoadChain(L0->getPointerOperand(), SzInBytes,
		Alignment, DL)) {
auto Chains = splitOddVectorElts(Chain, Sz);		auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeLoadChain(Chains.first, InstructionsProcessed) \|		return vectorizeLoadChain(Chains.first, InstructionsProcessed) \|
vectorizeLoadChain(Chains.second, InstructionsProcessed);		vectorizeLoadChain(Chains.second, InstructionsProcessed);
}		}

LLVM_DEBUG({		LLVM_DEBUG({
dbgs() << "LSV: Loads to vectorize:\n";		dbgs() << "LSV: Loads to vectorize:\n";
for (Instruction *I : Chain)		for (Instruction *I : Chain)
▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines

test/Transforms/LoadStoreVectorizer/AMDGPU/ds-bounds.ll

This file was added.

				; RUN: opt -mtriple=amdgcn-mesa-mesa3d -mcpu=verde -load-store-vectorizer -S -o - %s \| FileCheck -check-prefixes=ALL,SI %s
				; RUN: opt -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -load-store-vectorizer -S -o - %s \| FileCheck -check-prefixes=ALL,NONSI %s

				target datalayout = "e-p:64:64-p1:64:64-p2:32:32:32:32:16-p3:32:32:32:32:16-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

				@compute_lds = external addrspace(3) global [512 x i32], align 16

				; ALL-LABEL: @store_aligned(
				; ALL: store <2 x i32> <i32 42, i32 43>, <2 x i32> addrspace(3)* %0, align 8
				define amdgpu_cs void @store_aligned(i32 addrspace(3)* %ptr) #0 {
				entry:
				%ptr.gep.1 = getelementptr i32, i32 addrspace(3)* %ptr, i32 1

				store i32 42, i32 addrspace(3)* %ptr, align 8
				store i32 43, i32 addrspace(3)* %ptr.gep.1
				ret void
				}


				; ALL-LABEL: @store_global_const_idx(
				;
				; TODO: Addresses are known-positive, this could be merged!
				; SI: store i32
				; SI: store i32
				;
				; NONSI: store <2 x i32> <i32 42, i32 43>, <2 x i32> addrspace(3)* %0, align 4
				define amdgpu_cs void @store_global_const_idx() #0 {
				entry:
				%ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 3
				%ptr.b = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 4

				store i32 42, i32 addrspace(3)* %ptr.a
				store i32 43, i32 addrspace(3)* %ptr.b
				ret void
				}


				; ALL-LABEL: @store_global_var_idx_case1(
				; SI: store i32
				; SI: store i32
				; NONSI: store <2 x i32> <i32 42, i32 43>, <2 x i32> addrspace(3)* %0, align 4
				define amdgpu_cs void @store_global_var_idx_case1(i32 %idx) #0 {
				entry:
				%ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx
				%ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1

				store i32 42, i32 addrspace(3)* %ptr.a
				store i32 43, i32 addrspace(3)* %ptr.b
				ret void
				}


				; ALL-LABEL: @store_global_var_idx_case2(
				;
				; TODO: Addresses are known-positive, this could be merged!
				; SI: store i32
				; SI: store i32
				;
				; NONSI: store <2 x i32> <i32 42, i32 43>, <2 x i32> addrspace(3)* %0, align 4
				define amdgpu_cs void @store_global_var_idx_case2(i32 %idx) #0 {
				entry:
				%idx.and = and i32 %idx, 255
				%ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx.and
				%ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1

				store i32 42, i32 addrspace(3)* %ptr.a
				store i32 43, i32 addrspace(3)* %ptr.b
				ret void
				}


				attributes #0 = { nounwind }

test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll

; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s \| FileCheck %s		; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=verde -load-store-vectorizer -S -o - %s \| FileCheck --check-prefixes=CHECK,SI %s
		; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -load-store-vectorizer -S -o - %s \| FileCheck --check-prefixes=CHECK,NONSI %s
; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions		; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"		target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; TODO: Vector element tests		; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations		; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted		; TODO: Same base addrspacecasted

▲ Show 20 Lines • Show All 468 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
%out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1		%out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

store i8 123, i8 addrspace(3)* %out.gep.1		store i8 123, i8 addrspace(3)* %out.gep.1
store i8 456, i8 addrspace(3)* %out, align 2		store i8 456, i8 addrspace(3)* %out, align 2
ret void		ret void
}		}

; CHECK-LABEL: @merge_local_store_2_constants_i32		; CHECK-LABEL: @merge_local_store_2_constants_i32
; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4		; SI: store i32
		; SI: store i32
		; NONSI: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {		define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1		%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

store i32 123, i32 addrspace(3)* %out.gep.1		store i32 123, i32 addrspace(3)* %out.gep.1
store i32 456, i32 addrspace(3)* %out		store i32 456, i32 addrspace(3)* %out
ret void		ret void
}		}

; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2		; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2
; CHECK: store i32		; CHECK: store i32
; CHECK: store i32		; CHECK: store i32
define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {		define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1		%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

store i32 123, i32 addrspace(3)* %out.gep.1, align 2		store i32 123, i32 addrspace(3)* %out.gep.1, align 2
store i32 456, i32 addrspace(3)* %out, align 2		store i32 456, i32 addrspace(3)* %out, align 2
ret void		ret void
}		}

; CHECK-LABEL: @merge_local_store_4_constants_i32		; CHECK-LABEL: @merge_local_store_4_constants_i32
; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(3)*		; SI: store i32
		; SI: store i32
		; SI: store i32
		; SI: store i32
		; NONSI: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(3)*
define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {		define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1		%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2		%out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3		%out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

store i32 123, i32 addrspace(3)* %out.gep.1		store i32 123, i32 addrspace(3)* %out.gep.1
store i32 456, i32 addrspace(3)* %out.gep.2		store i32 456, i32 addrspace(3)* %out.gep.2
store i32 333, i32 addrspace(3)* %out.gep.3		store i32 333, i32 addrspace(3)* %out.gep.3
▲ Show 20 Lines • Show All 142 Lines • Show Last 20 Lines

test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll

	; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s \| FileCheck %s			; RUN: opt -mtriple=amdgcn-- -mcpu=bonaire -load-store-vectorizer -S -o - %s \| FileCheck %s

	target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"			target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

	@lds = internal addrspace(3) global [512 x float] undef, align 4			@lds = internal addrspace(3) global [512 x float] undef, align 4

	; The original load has an implicit alignment of 4, and should not			; The original load has an implicit alignment of 4, and should not
	; increase to an align 8 load.			; increase to an align 8 load.

	Show All 23 Lines

test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll

	; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s \| FileCheck %s			; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -basicaa -load-store-vectorizer -S -o - %s \| FileCheck %s

	target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"			target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

	; Checks that there is no crash when there are multiple tails			; Checks that there is no crash when there are multiple tails
	; for the same head starting a chain.			; for the same head starting a chain.
	@0 = internal addrspace(3) global [16384 x i32] undef			@0 = internal addrspace(3) global [16384 x i32] undef

	; CHECK-LABEL: @no_crash(			; CHECK-LABEL: @no_crash(
	▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines

test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll

	; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s \| FileCheck %s			; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -basicaa -load-store-vectorizer -S -o - %s \| FileCheck %s

	target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"			target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

	declare i32 @llvm.amdgcn.workitem.id.x() #1			declare i32 @llvm.amdgcn.workitem.id.x() #1

	; CHECK-LABEL: @merge_v2p1i8(			; CHECK-LABEL: @merge_v2p1i8(
	; CHECK: load <2 x i64>			; CHECK: load <2 x i64>
	; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*			; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
	▲ Show 20 Lines • Show All 302 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Avoid selecting ds_{read,write}2_b32 on SI
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 169270

include/llvm/Analysis/TargetTransformInfo.h

include/llvm/Analysis/TargetTransformInfoImpl.h

lib/Analysis/TargetTransformInfo.cpp

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

lib/Target/NVPTX/NVPTXTargetTransformInfo.h

lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

test/Transforms/LoadStoreVectorizer/AMDGPU/ds-bounds.ll

test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll

test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll

test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll

test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Avoid selecting ds_{read,write}2_b32 on SI
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 169270

include/llvm/Analysis/TargetTransformInfo.h

include/llvm/Analysis/TargetTransformInfoImpl.h

lib/Analysis/TargetTransformInfo.cpp

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

lib/Target/NVPTX/NVPTXTargetTransformInfo.h

lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

test/Transforms/LoadStoreVectorizer/AMDGPU/ds-bounds.ll

test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll

test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll

test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll

test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll

AMDGPU: Avoid selecting ds_{read,write}2_b32 on SI
ClosedPublic