Diff 120814

include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 548 Lines • ▼ Show 20 Lines	public:
/// If target has efficient vector element load/store instructions, it can		/// If target has efficient vector element load/store instructions, it can
/// return true here so that insertion/extraction costs are not added to		/// return true here so that insertion/extraction costs are not added to
/// the scalarization cost of a load/store.		/// the scalarization cost of a load/store.
bool supportsEfficientVectorElementLoadStore() const;		bool supportsEfficientVectorElementLoadStore() const;

/// \brief Don't restrict interleaved unrolling to small loops.		/// \brief Don't restrict interleaved unrolling to small loops.
bool enableAggressiveInterleaving(bool LoopHasReductions) const;		bool enableAggressiveInterleaving(bool LoopHasReductions) const;

/// \brief Enable inline expansion of memcmp		/// \brief If not nullptr, enable inline expansion of memcmp. IsZeroCmp is
bool enableMemCmpExpansion(unsigned &MaxLoadSize) const;		/// true if this is the expansion of memcmp(p1, p2, s) == 0.
		struct MemCmpExpansionOptions {
		// The list of available load sizes (in bytes), sorted in decreasing order.
		SmallVector<unsigned, 8> LoadSizes;
		};
		const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const;

		[Review comment — spatel, marked done] We should avoid using different vocabulary in this API than what is in the expansion code. Instead of 'IsThreeWay' here and in other places, could we use 'IsZeroCmp'?
/// \brief Enable matching of interleaved access groups.		/// \brief Enable matching of interleaved access groups.
bool enableInterleavedAccessVectorization() const;		bool enableInterleavedAccessVectorization() const;

/// \brief Indicate that it is potentially unsafe to automatically vectorize		/// \brief Indicate that it is potentially unsafe to automatically vectorize
/// floating-point operations because the semantics of vector and scalar		/// floating-point operations because the semantics of vector and scalar
/// floating-point semantics may differ. For example, ARM NEON v7 SIMD math		/// floating-point semantics may differ. For example, ARM NEON v7 SIMD math
/// does not support IEEE-754 denormal numbers, while depending on the		/// does not support IEEE-754 denormal numbers, while depending on the
/// platform, scalar floating-point math does.		/// platform, scalar floating-point math does.
▲ Show 20 Lines • Show All 420 Lines • ▼ Show 20 Lines	public:
virtual bool shouldBuildLookupTables() = 0;		virtual bool shouldBuildLookupTables() = 0;
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;		virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
virtual unsigned		virtual unsigned
getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;		getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,		virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) = 0;		unsigned VF) = 0;
virtual bool supportsEfficientVectorElementLoadStore() = 0;		virtual bool supportsEfficientVectorElementLoadStore() = 0;
virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;		virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
virtual bool enableMemCmpExpansion(unsigned &MaxLoadSize) = 0;		virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
		bool IsZeroCmp) const = 0;
virtual bool enableInterleavedAccessVectorization() = 0;		virtual bool enableInterleavedAccessVectorization() = 0;
virtual bool isFPVectorizationPotentiallyUnsafe() = 0;		virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,		virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
unsigned BitWidth,		unsigned BitWidth,
unsigned AddressSpace,		unsigned AddressSpace,
unsigned Alignment,		unsigned Alignment,
bool *Fast) = 0;		bool *Fast) = 0;
virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;		virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
▲ Show 20 Lines • Show All 236 Lines • ▼ Show 20 Lines	public:

bool supportsEfficientVectorElementLoadStore() override {		bool supportsEfficientVectorElementLoadStore() override {
return Impl.supportsEfficientVectorElementLoadStore();		return Impl.supportsEfficientVectorElementLoadStore();
}		}

bool enableAggressiveInterleaving(bool LoopHasReductions) override {		bool enableAggressiveInterleaving(bool LoopHasReductions) override {
return Impl.enableAggressiveInterleaving(LoopHasReductions);		return Impl.enableAggressiveInterleaving(LoopHasReductions);
}		}
bool enableMemCmpExpansion(unsigned &MaxLoadSize) override {		const MemCmpExpansionOptions *enableMemCmpExpansion(
return Impl.enableMemCmpExpansion(MaxLoadSize);		bool IsZeroCmp) const override {
		return Impl.enableMemCmpExpansion(IsZeroCmp);
}		}
bool enableInterleavedAccessVectorization() override {		bool enableInterleavedAccessVectorization() override {
return Impl.enableInterleavedAccessVectorization();		return Impl.enableInterleavedAccessVectorization();
}		}
bool isFPVectorizationPotentiallyUnsafe() override {		bool isFPVectorizationPotentiallyUnsafe() override {
return Impl.isFPVectorizationPotentiallyUnsafe();		return Impl.isFPVectorizationPotentiallyUnsafe();
}		}
bool allowsMisalignedMemoryAccesses(LLVMContext &Context,		bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
▲ Show 20 Lines • Show All 317 Lines • Show Last 20 Lines

include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 288 Lines • ▼ Show 20 Lines	public:

unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,		unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) { return 0; }		unsigned VF) { return 0; }

bool supportsEfficientVectorElementLoadStore() { return false; }		bool supportsEfficientVectorElementLoadStore() { return false; }

bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }		bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }

bool enableMemCmpExpansion(unsigned &MaxLoadSize) { return false; }		const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
		bool IsZeroCmp) const {
		return nullptr;
		}

bool enableInterleavedAccessVectorization() { return false; }		bool enableInterleavedAccessVectorization() { return false; }

bool isFPVectorizationPotentiallyUnsafe() { return false; }		bool isFPVectorizationPotentiallyUnsafe() { return false; }

bool allowsMisalignedMemoryAccesses(LLVMContext &Context,		bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
unsigned BitWidth,		unsigned BitWidth,
unsigned AddressSpace,		unsigned AddressSpace,
▲ Show 20 Lines • Show All 519 Lines • Show Last 20 Lines

lib/Analysis/TargetTransformInfo.cpp

	Show First 20 Lines • Show All 244 Lines • ▼ Show 20 Lines
	bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {			bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
	return TTIImpl->supportsEfficientVectorElementLoadStore();			return TTIImpl->supportsEfficientVectorElementLoadStore();
	}			}

	bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {			bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {
	return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);			return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
	}			}

	bool TargetTransformInfo::enableMemCmpExpansion(unsigned &MaxLoadSize) const {			const TargetTransformInfo::MemCmpExpansionOptions *
	return TTIImpl->enableMemCmpExpansion(MaxLoadSize);			TargetTransformInfo::enableMemCmpExpansion(bool IsZeroCmp) const {
				return TTIImpl->enableMemCmpExpansion(IsZeroCmp);
	}			}

	bool TargetTransformInfo::enableInterleavedAccessVectorization() const {			bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
	return TTIImpl->enableInterleavedAccessVectorization();			return TTIImpl->enableInterleavedAccessVectorization();
	}			}

	bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {			bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
	return TTIImpl->isFPVectorizationPotentiallyUnsafe();			return TTIImpl->isFPVectorizationPotentiallyUnsafe();
	▲ Show 20 Lines • Show All 938 Lines • Show Last 20 Lines

lib/CodeGen/CodeGenPrepare.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,752 Lines • ▼ Show 20 Lines	void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
unsigned &LoadIndex);		unsigned &LoadIndex);
void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);		void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
void emitMemCmpResultBlock();		void emitMemCmpResultBlock();
Value *getMemCmpExpansionZeroCase();		Value *getMemCmpExpansionZeroCase();
Value *getMemCmpEqZeroOneBlock();		Value *getMemCmpEqZeroOneBlock();
Value *getMemCmpOneBlock();		Value *getMemCmpOneBlock();

public:		public:
MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize,		MemCmpExpansion(CallInst *CI, uint64_t Size,
unsigned MaxNumLoads, unsigned NumLoadsPerBlock,		const TargetTransformInfo::MemCmpExpansionOptions &Options,
const DataLayout &DL);		unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
		unsigned NumLoadsPerBlock, const DataLayout &DL);

unsigned getNumBlocks();		unsigned getNumBlocks();
uint64_t getNumLoads() const { return LoadSequence.size(); }		uint64_t getNumLoads() const { return LoadSequence.size(); }

Value *getMemCmpExpansion();		Value *getMemCmpExpansion();
};		};

} // end anonymous namespace		} // end anonymous namespace

// Initialize the basic block structure required for expansion of memcmp call		// Initialize the basic block structure required for expansion of memcmp call
// with given maximum load size and memcmp size parameter.		// with given maximum load size and memcmp size parameter.
// This structure includes:		// This structure includes:
// 1. A list of load compare blocks - LoadCmpBlocks.		// 1. A list of load compare blocks - LoadCmpBlocks.
// 2. An EndBlock, split from original instruction point, which is the block to		// 2. An EndBlock, split from original instruction point, which is the block to
// return from.		// return from.
// 3. ResultBlock, block to branch to for early exit when a		// 3. ResultBlock, block to branch to for early exit when a
// LoadCmpBlock finds a difference.		// LoadCmpBlock finds a difference.
MemCmpExpansion::MemCmpExpansion(CallInst *const CI, uint64_t Size,		MemCmpExpansion::MemCmpExpansion(
const unsigned MaxLoadSize,		CallInst *const CI, uint64_t Size,
const unsigned MaxNumLoads,		const TargetTransformInfo::MemCmpExpansionOptions &Options,
const unsigned LoadsPerBlock,		const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
const DataLayout &TheDataLayout)		const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
: CI(CI),		: CI(CI),
Size(Size),		Size(Size),
MaxLoadSize(MaxLoadSize),		MaxLoadSize(0),
NumLoadsNonOneByte(0),		NumLoadsNonOneByte(0),
NumLoadsPerBlock(LoadsPerBlock),		NumLoadsPerBlock(NumLoadsPerBlock),
IsUsedForZeroCmp(isOnlyUsedInZeroEqualityComparison(CI)),		IsUsedForZeroCmp(IsUsedForZeroCmp),
DL(TheDataLayout),		DL(TheDataLayout),
Builder(CI) {		Builder(CI) {
assert(Size > 0 && "zero blocks");		assert(Size > 0 && "zero blocks");
// Scale the max size down if the target can load more bytes than we need.		// Scale the max size down if the target can load more bytes than we need.
while (this->MaxLoadSize > Size) {		size_t LoadSizeIndex = 0;
this->MaxLoadSize /= 2;		while (LoadSizeIndex < Options.LoadSizes.size() &&
		Options.LoadSizes[LoadSizeIndex] > Size) {
		++LoadSizeIndex;
}		}
		this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
// Compute the decomposition.		// Compute the decomposition.
unsigned LoadSize = this->MaxLoadSize;
uint64_t CurSize = Size;		uint64_t CurSize = Size;
uint64_t Offset = 0;		uint64_t Offset = 0;
while (CurSize) {		while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
		const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
assert(LoadSize > 0 && "zero load size");		assert(LoadSize > 0 && "zero load size");
const uint64_t NumLoadsForThisSize = CurSize / LoadSize;		const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {		if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
// Do not expand if the total number of loads is larger than what the		// Do not expand if the total number of loads is larger than what the
// target allows. Note that it's important that we exit before completing		// target allows. Note that it's important that we exit before completing
// the expansion to avoid using a ton of memory to store the expansion for		// the expansion to avoid using a ton of memory to store the expansion for
// large sizes.		// large sizes.
LoadSequence.clear();		LoadSequence.clear();
return;		return;
}		}
if (NumLoadsForThisSize > 0) {		if (NumLoadsForThisSize > 0) {
for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {		for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
LoadSequence.push_back({LoadSize, Offset});		LoadSequence.push_back({LoadSize, Offset});
Offset += LoadSize;		Offset += LoadSize;
}		}
if (LoadSize > 1) {		if (LoadSize > 1) {
++NumLoadsNonOneByte;		++NumLoadsNonOneByte;
}		}
CurSize = CurSize % LoadSize;		CurSize = CurSize % LoadSize;
}		}
// FIXME: This can result in a non-native load size (e.g. X86-32+SSE can		++LoadSizeIndex;
// load 16 and 4 but not 8), which throws the load count off (e.g. in the
// aforementioned case, 16 bytes will count for 2 loads but will generate
// 4).
LoadSize /= 2;
}		}
assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");		assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
}		}

unsigned MemCmpExpansion::getNumBlocks() {		unsigned MemCmpExpansion::getNumBlocks() {
if (IsUsedForZeroCmp)		if (IsUsedForZeroCmp)
return getNumLoads() / NumLoadsPerBlock +		return getNumLoads() / NumLoadsPerBlock +
(getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);		(getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
▲ Show 20 Lines • Show All 520 Lines • ▼ Show 20 Lines	static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
}		}
const uint64_t SizeVal = SizeCast->getZExtValue();		const uint64_t SizeVal = SizeCast->getZExtValue();

if (SizeVal == 0) {		if (SizeVal == 0) {
return false;		return false;
}		}

// TTI call to check if target would like to expand memcmp. Also, get the		// TTI call to check if target would like to expand memcmp. Also, get the
// max LoadSize.		// available load sizes.
unsigned MaxLoadSize;		const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return false;		const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
		if (!Options) return false;

const unsigned MaxNumLoads =		const unsigned MaxNumLoads =
TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());		TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());

MemCmpExpansion Expansion(CI, SizeVal, MaxLoadSize, MaxNumLoads,		MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
MemCmpNumLoadsPerBlock, *DL);		IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);

// Don't expand if this will require more loads than desired by the target.		// Don't expand if this will require more loads than desired by the target.
if (Expansion.getNumLoads() == 0) {		if (Expansion.getNumLoads() == 0) {
NumMemCmpGreaterThanMax++;		NumMemCmpGreaterThanMax++;
return false;		return false;
}		}

NumMemCmpInlined++;		NumMemCmpInlined++;
▲ Show 20 Lines • Show All 4,569 Lines • Show Last 20 Lines

lib/Target/PowerPC/PPCTargetTransformInfo.h

Show First 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);		TTI::UnrollingPreferences &UP);

/// @}		/// @}

/// \name Vector TTI Implementations		/// \name Vector TTI Implementations
/// @{		/// @{

bool enableAggressiveInterleaving(bool LoopHasReductions);		bool enableAggressiveInterleaving(bool LoopHasReductions);
bool enableMemCmpExpansion(unsigned &MaxLoadSize);		const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
		bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();		bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);		unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector) const;		unsigned getRegisterBitWidth(bool Vector) const;
unsigned getCacheLineSize();		unsigned getCacheLineSize();
unsigned getPrefetchDistance();		unsigned getPrefetchDistance();
unsigned getMaxInterleaveFactor(unsigned VF);		unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(		int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,		unsigned Opcode, Type *Ty,
Show All 25 Lines

lib/Target/PowerPC/PPCTargetTransformInfo.cpp

Show First 20 Lines • Show All 220 Lines • ▼ Show 20 Lines	bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
// do so is particularly expensive. This makes it much more likely (compared		// do so is particularly expensive. This makes it much more likely (compared
// to only using concatenation unrolling).		// to only using concatenation unrolling).
if (ST->getDarwinDirective() == PPC::DIR_A2)		if (ST->getDarwinDirective() == PPC::DIR_A2)
return true;		return true;

return LoopHasReductions;		return LoopHasReductions;
}		}

bool PPCTTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {		const PPCTTIImpl::TTI::MemCmpExpansionOptions *
MaxLoadSize = 8;		PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
return true;		static const auto Options = []() {
		TTI::MemCmpExpansionOptions Options;
		Options.LoadSizes.push_back(8);
		Options.LoadSizes.push_back(4);
		Options.LoadSizes.push_back(2);
		Options.LoadSizes.push_back(1);
		return Options;
		}();
		return &Options;
}		}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {		bool PPCTTIImpl::enableInterleavedAccessVectorization() {
return true;		return true;
}		}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {		unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
if (Vector && !ST->hasAltivec() && !ST->hasQPX())		if (Vector && !ST->hasAltivec() && !ST->hasQPX())
▲ Show 20 Lines • Show All 235 Lines • Show Last 20 Lines

lib/Target/X86/X86TargetTransformInfo.h

Show First 20 Lines • Show All 121 Lines • ▼ Show 20 Lines	bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);		TargetTransformInfo::LSRCost &C2);
bool isLegalMaskedLoad(Type *DataType);		bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);		bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);		bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);		bool isLegalMaskedScatter(Type *DataType);
bool hasDivRemOp(Type *DataType, bool IsSigned);		bool hasDivRemOp(Type *DataType, bool IsSigned);
bool areInlineCompatible(const Function *Caller,		bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;		const Function *Callee) const;
bool enableMemCmpExpansion(unsigned &MaxLoadSize);		const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
		bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();		bool enableInterleavedAccessVectorization();
private:		private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,		int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
unsigned Alignment, unsigned AddressSpace);		unsigned Alignment, unsigned AddressSpace);
int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,		int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
unsigned Alignment, unsigned AddressSpace);		unsigned Alignment, unsigned AddressSpace);

/// @}		/// @}
};		};

} // end namespace llvm		} // end namespace llvm

#endif		#endif

lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 2,530 Lines • ▼ Show 20 Lines	const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();		TM.getSubtargetImpl(*Callee)->getFeatureBits();

// FIXME: This is likely too limiting as it will include subtarget features		// FIXME: This is likely too limiting as it will include subtarget features
// that we might not care about for inlining, but it is conservatively		// that we might not care about for inlining, but it is conservatively
// correct.		// correct.
return (CallerBits & CalleeBits) == CalleeBits;		return (CallerBits & CalleeBits) == CalleeBits;
}		}

bool X86TTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {		const X86TTIImpl::TTI::MemCmpExpansionOptions *
// TODO: We can increase these based on available vector ops.		X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
MaxLoadSize = ST->is64Bit() ? 8 : 4;		// Only enable vector loads for equality comparison.
return true;		// Right now the vector version is not as fast, see #33329.
		[Review comment — spatel, marked done] This isn't strictly a question of bswap availability; see the suggested code in https://bugs.llvm.org/show_bug.cgi?id=33329
		static const auto ThreeWayOptions = [this]() {
		TTI::MemCmpExpansionOptions Options;
		if (ST->is64Bit()) {
		Options.LoadSizes.push_back(8);
		}
		Options.LoadSizes.push_back(4);
		Options.LoadSizes.push_back(2);
		Options.LoadSizes.push_back(1);
		return Options;
		}();
		static const auto EqZeroOptions = [this]() {
		TTI::MemCmpExpansionOptions Options;
		// TODO: enable AVX512 when the DAG is ready.
		[Review comment — spatel, marked done] AVX512 needs to be a 'TODO' in this patch. We're going to need to add more tests (64- and 128-byte), and the DAG is not prepared for those patterns yet.
		// if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
		if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
		[Review comment — spatel, not done] This isn't correct (or at least it doesn't match what the DAG handles optimally). I've added extra runs to memcmp.ll so we can see what happens for SSE1/AVX1 vs. SSE2/AVX2.
		[Reply — courbet (author), not done] Right. I was basing this on getRegisterBitWidth(), but now I see that the DAG can do something else for the sake of performance. So I switched to 16B for >= SSE2 and 32B for >= AVX2. Thanks for the tests.
		if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
		if (ST->is64Bit()) {
		Options.LoadSizes.push_back(8);
		}
		Options.LoadSizes.push_back(4);
		Options.LoadSizes.push_back(2);
		Options.LoadSizes.push_back(1);
		return Options;
		}();
		return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
}		}

bool X86TTIImpl::enableInterleavedAccessVectorization() {		bool X86TTIImpl::enableInterleavedAccessVectorization() {
// TODO: We expect this to be beneficial regardless of arch,		// TODO: We expect this to be beneficial regardless of arch,
// but there are currently some unexplained performance artifacts on Atom.		// but there are currently some unexplained performance artifacts on Atom.
// As a temporary solution, disable on Atom.		// As a temporary solution, disable on Atom.
return !(ST->isAtom());		return !(ST->isAtom());
}		}
▲ Show 20 Lines • Show All 260 Lines • Show Last 20 Lines

lib/Transforms/Scalar/MergeICmps.cpp

	Show First 20 Lines • Show All 619 Lines • ▼ Show 20 Lines
	};			};

	PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,			PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,
	const TargetTransformInfo *TTI) {			const TargetTransformInfo *TTI) {
	DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");			DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");

	// We only try merging comparisons if the target wants to expand memcmp later.			// We only try merging comparisons if the target wants to expand memcmp later.
	// The rationale is to avoid turning small chains into memcmp calls.			// The rationale is to avoid turning small chains into memcmp calls.
	unsigned MaxLoadSize;			if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all();
	if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return PreservedAnalyses::all();

	bool MadeChange = false;			bool MadeChange = false;

	for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {			for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
	// A Phi operation is always first in a basic block.			// A Phi operation is always first in a basic block.
	if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))			if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
	MadeChange |= processPhi(*Phi, TLI);			MadeChange |= processPhi(*Phi, TLI);
	}			}
	Show All 16 Lines

test/CodeGen/X86/memcmp-optsize.ll

	Show First 20 Lines • Show All 592 Lines • ▼ Show 20 Lines
	; X86-SSE2-NEXT: movdqu (%ecx), %xmm0			; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
	; X86-SSE2-NEXT: movdqu (%eax), %xmm1			; X86-SSE2-NEXT: movdqu (%eax), %xmm1
	; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1			; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
	; X86-SSE2-NEXT: pmovmskb %xmm1, %eax			; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
	; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF			; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X86-SSE2-NEXT: setne %al			; X86-SSE2-NEXT: setne %al
	; X86-SSE2-NEXT: retl			; X86-SSE2-NEXT: retl
	;			;
	; X64-LABEL: length16_eq:			; X64-SSE2-LABEL: length16_eq:
	; X64: # BB#0: # %loadbb			; X64-SSE2: # BB#0:
	; X64-NEXT: movq (%rdi), %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-NEXT: cmpq (%rsi), %rax			; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
	; X64-NEXT: jne .LBB17_1			; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
	; X64-NEXT: # BB#2: # %loadbb1			; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
	; X64-NEXT: movq 8(%rdi), %rcx			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: xorl %eax, %eax			; X64-SSE2-NEXT: setne %al
	; X64-NEXT: cmpq 8(%rsi), %rcx			; X64-SSE2-NEXT: retq
	; X64-NEXT: je .LBB17_3			;
	; X64-NEXT: .LBB17_1: # %res_block			; X64-AVX2-LABEL: length16_eq:
	; X64-NEXT: movl $1, %eax			; X64-AVX2: # BB#0:
	; X64-NEXT: .LBB17_3: # %endblock			; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
	; X64-NEXT: testl %eax, %eax			; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
	; X64-NEXT: setne %al			; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
	; X64-NEXT: retq			; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-AVX2-NEXT: setne %al
				; X64-AVX2-NEXT: retq
	%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind			%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
	%cmp = icmp ne i32 %call, 0			%cmp = icmp ne i32 %call, 0
	ret i1 %cmp			ret i1 %cmp
	}			}

	define i1 @length16_eq_const(i8* %X) nounwind optsize {			define i1 @length16_eq_const(i8* %X) nounwind optsize {
	; X86-NOSSE-LABEL: length16_eq_const:			; X86-NOSSE-LABEL: length16_eq_const:
	; X86-NOSSE: # BB#0:			; X86-NOSSE: # BB#0:
	Show All 12 Lines
	; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-SSE2-NEXT: movdqu (%eax), %xmm0			; X86-SSE2-NEXT: movdqu (%eax), %xmm0
	; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0			; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
	; X86-SSE2-NEXT: pmovmskb %xmm0, %eax			; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
	; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF			; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X86-SSE2-NEXT: sete %al			; X86-SSE2-NEXT: sete %al
	; X86-SSE2-NEXT: retl			; X86-SSE2-NEXT: retl
	;			;
	; X64-LABEL: length16_eq_const:			; X64-SSE2-LABEL: length16_eq_const:
	; X64: # BB#0: # %loadbb			; X64-SSE2: # BB#0:
	; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-NEXT: cmpq %rax, (%rdi)			; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
	; X64-NEXT: jne .LBB18_1			; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
	; X64-NEXT: # BB#2: # %loadbb1			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: xorl %eax, %eax			; X64-SSE2-NEXT: sete %al
	; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938			; X64-SSE2-NEXT: retq
	; X64-NEXT: cmpq %rcx, 8(%rdi)			;
	; X64-NEXT: je .LBB18_3			; X64-AVX2-LABEL: length16_eq_const:
	; X64-NEXT: .LBB18_1: # %res_block			; X64-AVX2: # BB#0:
	; X64-NEXT: movl $1, %eax			; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
	; X64-NEXT: .LBB18_3: # %endblock			; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
	; X64-NEXT: testl %eax, %eax			; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
	; X64-NEXT: sete %al			; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: retq			; X64-AVX2-NEXT: sete %al
				; X64-AVX2-NEXT: retq
	%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind			%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
	%c = icmp eq i32 %m, 0			%c = icmp eq i32 %m, 0
	ret i1 %c			ret i1 %c
	}			}

	; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914			; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914

	define i32 @length24(i8* %X, i8* %Y) nounwind optsize {			define i32 @length24(i8* %X, i8* %Y) nounwind optsize {
	Show All 23 Lines
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NEXT: testl %eax, %eax
	; X86-NEXT: sete %al			; X86-NEXT: sete %al
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: length24_eq:			; X64-SSE2-LABEL: length24_eq:
	; X64: # BB#0:			; X64-SSE2: # BB#0: # %loadbb
	; X64-NEXT: pushq %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-NEXT: movl $24, %edx			; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
	; X64-NEXT: callq memcmp			; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
	; X64-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
	; X64-NEXT: sete %al			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: popq %rcx			; X64-SSE2-NEXT: jne .LBB20_1
	; X64-NEXT: retq			; X64-SSE2-NEXT: # BB#2: # %loadbb1
				; X64-SSE2-NEXT: movq 16(%rdi), %rcx
				; X64-SSE2-NEXT: xorl %eax, %eax
				; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
				; X64-SSE2-NEXT: je .LBB20_3
				; X64-SSE2-NEXT: .LBB20_1: # %res_block
				; X64-SSE2-NEXT: movl $1, %eax
				; X64-SSE2-NEXT: .LBB20_3: # %endblock
				; X64-SSE2-NEXT: testl %eax, %eax
				; X64-SSE2-NEXT: sete %al
				; X64-SSE2-NEXT: retq
				;
				; X64-AVX2-LABEL: length24_eq:
				; X64-AVX2: # BB#0: # %loadbb
				; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
				; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
				; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
				; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-AVX2-NEXT: jne .LBB20_1
				; X64-AVX2-NEXT: # BB#2: # %loadbb1
				; X64-AVX2-NEXT: movq 16(%rdi), %rcx
				; X64-AVX2-NEXT: xorl %eax, %eax
				; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
				; X64-AVX2-NEXT: je .LBB20_3
				; X64-AVX2-NEXT: .LBB20_1: # %res_block
				; X64-AVX2-NEXT: movl $1, %eax
				; X64-AVX2-NEXT: .LBB20_3: # %endblock
				; X64-AVX2-NEXT: testl %eax, %eax
				; X64-AVX2-NEXT: sete %al
				; X64-AVX2-NEXT: retq
	%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind			%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
	%cmp = icmp eq i32 %call, 0			%cmp = icmp eq i32 %call, 0
	ret i1 %cmp			ret i1 %cmp
	}			}

	define i1 @length24_eq_const(i8* %X) nounwind optsize {			define i1 @length24_eq_const(i8* %X) nounwind optsize {
	; X86-LABEL: length24_eq_const:			; X86-LABEL: length24_eq_const:
	; X86: # BB#0:			; X86: # BB#0:
	; X86-NEXT: pushl $0			; X86-NEXT: pushl $0
	; X86-NEXT: pushl $24			; X86-NEXT: pushl $24
	; X86-NEXT: pushl $.L.str			; X86-NEXT: pushl $.L.str
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NEXT: testl %eax, %eax
	; X86-NEXT: setne %al			; X86-NEXT: setne %al
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: length24_eq_const:			; X64-SSE2-LABEL: length24_eq_const:
	; X64: # BB#0:			; X64-SSE2: # BB#0: # %loadbb
	; X64-NEXT: pushq %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-NEXT: movl $.L.str, %esi			; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
	; X64-NEXT: movl $24, %edx			; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
	; X64-NEXT: callq memcmp			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: jne .LBB21_1
	; X64-NEXT: setne %al			; X64-SSE2-NEXT: # BB#2: # %loadbb1
	; X64-NEXT: popq %rcx			; X64-SSE2-NEXT: xorl %eax, %eax
	; X64-NEXT: retq			; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
				; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
				; X64-SSE2-NEXT: je .LBB21_3
				; X64-SSE2-NEXT: .LBB21_1: # %res_block
				; X64-SSE2-NEXT: movl $1, %eax
				; X64-SSE2-NEXT: .LBB21_3: # %endblock
				; X64-SSE2-NEXT: testl %eax, %eax
				; X64-SSE2-NEXT: setne %al
				; X64-SSE2-NEXT: retq
				;
				; X64-AVX2-LABEL: length24_eq_const:
				; X64-AVX2: # BB#0: # %loadbb
				; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
				; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
				; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
				; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-AVX2-NEXT: jne .LBB21_1
				; X64-AVX2-NEXT: # BB#2: # %loadbb1
				; X64-AVX2-NEXT: xorl %eax, %eax
				; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
				; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
				; X64-AVX2-NEXT: je .LBB21_3
				; X64-AVX2-NEXT: .LBB21_1: # %res_block
				; X64-AVX2-NEXT: movl $1, %eax
				; X64-AVX2-NEXT: .LBB21_3: # %endblock
				; X64-AVX2-NEXT: testl %eax, %eax
				; X64-AVX2-NEXT: setne %al
				; X64-AVX2-NEXT: retq
	%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind			%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
	%c = icmp ne i32 %m, 0			%c = icmp ne i32 %m, 0
	ret i1 %c			ret i1 %c
	}			}

	define i32 @length32(i8* %X, i8* %Y) nounwind optsize {			define i32 @length32(i8* %X, i8* %Y) nounwind optsize {
	; X86-LABEL: length32:			; X86-LABEL: length32:
	; X86: # BB#0:			; X86: # BB#0:
	Show All 11 Lines
	; X64-NEXT: jmp memcmp # TAILCALL			; X64-NEXT: jmp memcmp # TAILCALL
	%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind			%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind
	ret i32 %m			ret i32 %m
	}			}

	; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325			; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325

	define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {			define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
	; X86-LABEL: length32_eq:			; X86-NOSSE-LABEL: length32_eq:
	; X86: # BB#0:			; X86-NOSSE: # BB#0:
	; X86-NEXT: pushl $0			; X86-NOSSE-NEXT: pushl $0
	; X86-NEXT: pushl $32			; X86-NOSSE-NEXT: pushl $32
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NOSSE-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NOSSE-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NOSSE-NEXT: testl %eax, %eax
	; X86-NEXT: sete %al			; X86-NOSSE-NEXT: sete %al
	; X86-NEXT: retl			; X86-NOSSE-NEXT: retl
				;
				; X86-SSE2-LABEL: length32_eq:
				; X86-SSE2: # BB#0: # %loadbb
				; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
				; X86-SSE2-NEXT: movdqu (%eax), %xmm1
				; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
				; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
				; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
				; X86-SSE2-NEXT: jne .LBB23_1
				; X86-SSE2-NEXT: # BB#2: # %loadbb1
				; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
				; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
				; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
				; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx
				; X86-SSE2-NEXT: xorl %eax, %eax
				; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X86-SSE2-NEXT: je .LBB23_3
				; X86-SSE2-NEXT: .LBB23_1: # %res_block
				; X86-SSE2-NEXT: xorl %eax, %eax
				; X86-SSE2-NEXT: incl %eax
				; X86-SSE2-NEXT: .LBB23_3: # %endblock
				; X86-SSE2-NEXT: testl %eax, %eax
				; X86-SSE2-NEXT: sete %al
				; X86-SSE2-NEXT: retl
	;			;
	; X64-SSE2-LABEL: length32_eq:			; X64-SSE2-LABEL: length32_eq:
	; X64-SSE2: # BB#0:			; X64-SSE2: # BB#0: # %loadbb
	; X64-SSE2-NEXT: pushq %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-SSE2-NEXT: movl $32, %edx			; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
	; X64-SSE2-NEXT: callq memcmp			; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
				; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
				; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-SSE2-NEXT: jne .LBB23_1
				; X64-SSE2-NEXT: # BB#2: # %loadbb1
				; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
				; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
				; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
				; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx
				; X64-SSE2-NEXT: xorl %eax, %eax
				; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X64-SSE2-NEXT: je .LBB23_3
				; X64-SSE2-NEXT: .LBB23_1: # %res_block
				; X64-SSE2-NEXT: movl $1, %eax
				; X64-SSE2-NEXT: .LBB23_3: # %endblock
	; X64-SSE2-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: testl %eax, %eax
	; X64-SSE2-NEXT: sete %al			; X64-SSE2-NEXT: sete %al
	; X64-SSE2-NEXT: popq %rcx
	; X64-SSE2-NEXT: retq			; X64-SSE2-NEXT: retq
	;			;
	; X64-AVX2-LABEL: length32_eq:			; X64-AVX2-LABEL: length32_eq:
	; X64-AVX2: # BB#0:			; X64-AVX2: # BB#0:
	; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0			; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
	; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0			; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
	; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax			; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
	; X64-AVX2-NEXT: cmpl $-1, %eax			; X64-AVX2-NEXT: cmpl $-1, %eax
	; X64-AVX2-NEXT: sete %al			; X64-AVX2-NEXT: sete %al
	; X64-AVX2-NEXT: vzeroupper			; X64-AVX2-NEXT: vzeroupper
	; X64-AVX2-NEXT: retq			; X64-AVX2-NEXT: retq
	%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind			%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
	%cmp = icmp eq i32 %call, 0			%cmp = icmp eq i32 %call, 0
	ret i1 %cmp			ret i1 %cmp
	}			}

	define i1 @length32_eq_const(i8* %X) nounwind optsize {			define i1 @length32_eq_const(i8* %X) nounwind optsize {
	; X86-LABEL: length32_eq_const:			; X86-NOSSE-LABEL: length32_eq_const:
	; X86: # BB#0:			; X86-NOSSE: # BB#0:
	; X86-NEXT: pushl $0			; X86-NOSSE-NEXT: pushl $0
	; X86-NEXT: pushl $32			; X86-NOSSE-NEXT: pushl $32
	; X86-NEXT: pushl $.L.str			; X86-NOSSE-NEXT: pushl $.L.str
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NOSSE-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NOSSE-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NOSSE-NEXT: testl %eax, %eax
	; X86-NEXT: setne %al			; X86-NOSSE-NEXT: setne %al
	; X86-NEXT: retl			; X86-NOSSE-NEXT: retl
				;
				; X86-SSE2-LABEL: length32_eq_const:
				; X86-SSE2: # BB#0: # %loadbb
				; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-SSE2-NEXT: movdqu (%eax), %xmm0
				; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
				; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
				; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X86-SSE2-NEXT: jne .LBB24_1
				; X86-SSE2-NEXT: # BB#2: # %loadbb1
				; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
				; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
				; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
				; X86-SSE2-NEXT: xorl %eax, %eax
				; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X86-SSE2-NEXT: je .LBB24_3
				; X86-SSE2-NEXT: .LBB24_1: # %res_block
				; X86-SSE2-NEXT: xorl %eax, %eax
				; X86-SSE2-NEXT: incl %eax
				; X86-SSE2-NEXT: .LBB24_3: # %endblock
				; X86-SSE2-NEXT: testl %eax, %eax
				; X86-SSE2-NEXT: setne %al
				; X86-SSE2-NEXT: retl
	;			;
	; X64-SSE2-LABEL: length32_eq_const:			; X64-SSE2-LABEL: length32_eq_const:
	; X64-SSE2: # BB#0:			; X64-SSE2: # BB#0: # %loadbb
	; X64-SSE2-NEXT: pushq %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-SSE2-NEXT: movl $.L.str, %esi			; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
	; X64-SSE2-NEXT: movl $32, %edx			; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
	; X64-SSE2-NEXT: callq memcmp			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-SSE2-NEXT: jne .LBB24_1
				; X64-SSE2-NEXT: # BB#2: # %loadbb1
				; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
				; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
				; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
				; X64-SSE2-NEXT: xorl %eax, %eax
				; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X64-SSE2-NEXT: je .LBB24_3
				; X64-SSE2-NEXT: .LBB24_1: # %res_block
				; X64-SSE2-NEXT: movl $1, %eax
				; X64-SSE2-NEXT: .LBB24_3: # %endblock
	; X64-SSE2-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: testl %eax, %eax
	; X64-SSE2-NEXT: setne %al			; X64-SSE2-NEXT: setne %al
	; X64-SSE2-NEXT: popq %rcx
	; X64-SSE2-NEXT: retq			; X64-SSE2-NEXT: retq
	;			;
	; X64-AVX2-LABEL: length32_eq_const:			; X64-AVX2-LABEL: length32_eq_const:
	; X64-AVX2: # BB#0:			; X64-AVX2: # BB#0:
	; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0			; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
	; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0			; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
	; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax			; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
	; X64-AVX2-NEXT: cmpl $-1, %eax			; X64-AVX2-NEXT: cmpl $-1, %eax
	Show All 32 Lines
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NEXT: testl %eax, %eax
	; X86-NEXT: setne %al			; X86-NEXT: setne %al
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: length64_eq:			; X64-SSE2-LABEL: length64_eq:
	; X64: # BB#0:			; X64-SSE2: # BB#0:
	; X64-NEXT: pushq %rax			; X64-SSE2-NEXT: pushq %rax
	; X64-NEXT: movl $64, %edx			; X64-SSE2-NEXT: movl $64, %edx
	; X64-NEXT: callq memcmp			; X64-SSE2-NEXT: callq memcmp
	; X64-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: testl %eax, %eax
	; X64-NEXT: setne %al			; X64-SSE2-NEXT: setne %al
	; X64-NEXT: popq %rcx			; X64-SSE2-NEXT: popq %rcx
	; X64-NEXT: retq			; X64-SSE2-NEXT: retq
				;
				; X64-AVX2-LABEL: length64_eq:
				; X64-AVX2: # BB#0: # %loadbb
				; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
				; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
				; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
				; X64-AVX2-NEXT: cmpl $-1, %eax
				; X64-AVX2-NEXT: jne .LBB26_1
				; X64-AVX2-NEXT: # BB#2: # %loadbb1
				; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
				; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
				; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
				; X64-AVX2-NEXT: xorl %eax, %eax
				; X64-AVX2-NEXT: cmpl $-1, %ecx
				; X64-AVX2-NEXT: je .LBB26_3
				; X64-AVX2-NEXT: .LBB26_1: # %res_block
				; X64-AVX2-NEXT: movl $1, %eax
				; X64-AVX2-NEXT: .LBB26_3: # %endblock
				; X64-AVX2-NEXT: testl %eax, %eax
				; X64-AVX2-NEXT: setne %al
				; X64-AVX2-NEXT: vzeroupper
				; X64-AVX2-NEXT: retq
	%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind			%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
	%cmp = icmp ne i32 %call, 0			%cmp = icmp ne i32 %call, 0
	ret i1 %cmp			ret i1 %cmp
	}			}

	define i1 @length64_eq_const(i8* %X) nounwind optsize {			define i1 @length64_eq_const(i8* %X) nounwind optsize {
	; X86-LABEL: length64_eq_const:			; X86-LABEL: length64_eq_const:
	; X86: # BB#0:			; X86: # BB#0:
	; X86-NEXT: pushl $0			; X86-NEXT: pushl $0
	; X86-NEXT: pushl $64			; X86-NEXT: pushl $64
	; X86-NEXT: pushl $.L.str			; X86-NEXT: pushl $.L.str
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NEXT: testl %eax, %eax
	; X86-NEXT: sete %al			; X86-NEXT: sete %al
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: length64_eq_const:			; X64-SSE2-LABEL: length64_eq_const:
	; X64: # BB#0:			; X64-SSE2: # BB#0:
	; X64-NEXT: pushq %rax			; X64-SSE2-NEXT: pushq %rax
	; X64-NEXT: movl $.L.str, %esi			; X64-SSE2-NEXT: movl $.L.str, %esi
	; X64-NEXT: movl $64, %edx			; X64-SSE2-NEXT: movl $64, %edx
	; X64-NEXT: callq memcmp			; X64-SSE2-NEXT: callq memcmp
	; X64-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: testl %eax, %eax
	; X64-NEXT: sete %al			; X64-SSE2-NEXT: sete %al
	; X64-NEXT: popq %rcx			; X64-SSE2-NEXT: popq %rcx
	; X64-NEXT: retq			; X64-SSE2-NEXT: retq
				;
				; X64-AVX2-LABEL: length64_eq_const:
				; X64-AVX2: # BB#0: # %loadbb
				; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
				; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
				; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
				; X64-AVX2-NEXT: cmpl $-1, %eax
				; X64-AVX2-NEXT: jne .LBB27_1
				; X64-AVX2-NEXT: # BB#2: # %loadbb1
				; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
				; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
				; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
				; X64-AVX2-NEXT: xorl %eax, %eax
				; X64-AVX2-NEXT: cmpl $-1, %ecx
				; X64-AVX2-NEXT: je .LBB27_3
				; X64-AVX2-NEXT: .LBB27_1: # %res_block
				; X64-AVX2-NEXT: movl $1, %eax
				; X64-AVX2-NEXT: .LBB27_3: # %endblock
				; X64-AVX2-NEXT: testl %eax, %eax
				; X64-AVX2-NEXT: sete %al
				; X64-AVX2-NEXT: vzeroupper
				; X64-AVX2-NEXT: retq
	%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind			%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
	%c = icmp eq i32 %m, 0			%c = icmp eq i32 %m, 0
	ret i1 %c			ret i1 %c
	}			}

test/CodeGen/X86/memcmp.ll

	Show First 20 Lines • Show All 48 Lines • ▼ Show 20 Lines
	; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-NEXT: movzwl (%ecx), %ecx			; X86-NEXT: movzwl (%ecx), %ecx
	; X86-NEXT: movzwl (%eax), %edx			; X86-NEXT: movzwl (%eax), %edx
	; X86-NEXT: rolw $8, %cx			; X86-NEXT: rolw $8, %cx
	; X86-NEXT: rolw $8, %dx			; X86-NEXT: rolw $8, %dx
	; X86-NEXT: movzwl %cx, %eax			; X86-NEXT: movzwl %cx, %eax
	; X86-NEXT: movzwl %dx, %ecx			; X86-NEXT: movzwl %dx, %ecx
	; X86-NEXT: subl %ecx, %eax			; X86-NEXT: subl %ecx, %eax
	; X86-NEXT: retl			; X86-NEXT: retl
				spatelUnsubmitted Not Done Reply Inline Actions Apologies for all this noise. I updated the script again at rL316443 to avoid the 'ret' diffs. I also added extra run lines for SSE1 and AVX1 at rL316446, so you'll need to rebase. spatel: Apologies for all this noise. I updated the script again at rL316443 to avoid the 'ret' diffs.
	;			;
	; X64-LABEL: length2:			; X64-LABEL: length2:
	; X64: # BB#0:			; X64: # BB#0:
	; X64-NEXT: movzwl (%rdi), %eax			; X64-NEXT: movzwl (%rdi), %eax
	; X64-NEXT: movzwl (%rsi), %ecx			; X64-NEXT: movzwl (%rsi), %ecx
	; X64-NEXT: rolw $8, %ax			; X64-NEXT: rolw $8, %ax
	; X64-NEXT: rolw $8, %cx			; X64-NEXT: rolw $8, %cx
	; X64-NEXT: movzwl %ax, %eax			; X64-NEXT: movzwl %ax, %eax
	▲ Show 20 Lines • Show All 568 Lines • ▼ Show 20 Lines
	; X86-SSE2-NEXT: movdqu (%ecx), %xmm0			; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
	; X86-SSE2-NEXT: movdqu (%eax), %xmm1			; X86-SSE2-NEXT: movdqu (%eax), %xmm1
	; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1			; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
	; X86-SSE2-NEXT: pmovmskb %xmm1, %eax			; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
	; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF			; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X86-SSE2-NEXT: setne %al			; X86-SSE2-NEXT: setne %al
	; X86-SSE2-NEXT: retl			; X86-SSE2-NEXT: retl
	;			;
	; X64-LABEL: length16_eq:			; X64-SSE2-LABEL: length16_eq:
	; X64: # BB#0: # %loadbb			; X64-SSE2: # BB#0:
	; X64-NEXT: movq (%rdi), %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-NEXT: cmpq (%rsi), %rax			; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
	; X64-NEXT: jne .LBB19_1			; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
	; X64-NEXT: # BB#2: # %loadbb1			; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
	; X64-NEXT: movq 8(%rdi), %rcx			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: xorl %eax, %eax			; X64-SSE2-NEXT: setne %al
	; X64-NEXT: cmpq 8(%rsi), %rcx			; X64-SSE2-NEXT: retq
	; X64-NEXT: je .LBB19_3			;
	; X64-NEXT: .LBB19_1: # %res_block			; X64-AVX-LABEL: length16_eq:
	; X64-NEXT: movl $1, %eax			; X64-AVX: # BB#0:
	; X64-NEXT: .LBB19_3: # %endblock			; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
	; X64-NEXT: testl %eax, %eax			; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
	; X64-NEXT: setne %al			; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
	; X64-NEXT: retq			; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-AVX-NEXT: setne %al
				; X64-AVX-NEXT: retq
	%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind			%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
	%cmp = icmp ne i32 %call, 0			%cmp = icmp ne i32 %call, 0
	ret i1 %cmp			ret i1 %cmp
	}			}

	define i1 @length16_eq_const(i8* %X) nounwind {			define i1 @length16_eq_const(i8* %X) nounwind {
	; X86-NOSSE-LABEL: length16_eq_const:			; X86-NOSSE-LABEL: length16_eq_const:
	; X86-NOSSE: # BB#0:			; X86-NOSSE: # BB#0:
	Show All 24 Lines
	; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-SSE2-NEXT: movdqu (%eax), %xmm0			; X86-SSE2-NEXT: movdqu (%eax), %xmm0
	; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0			; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
	; X86-SSE2-NEXT: pmovmskb %xmm0, %eax			; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
	; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF			; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X86-SSE2-NEXT: sete %al			; X86-SSE2-NEXT: sete %al
	; X86-SSE2-NEXT: retl			; X86-SSE2-NEXT: retl
	;			;
	; X64-LABEL: length16_eq_const:			; X64-SSE2-LABEL: length16_eq_const:
	; X64: # BB#0: # %loadbb			; X64-SSE2: # BB#0:
	; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-NEXT: cmpq %rax, (%rdi)			; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
	; X64-NEXT: jne .LBB20_1			; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
	; X64-NEXT: # BB#2: # %loadbb1			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: xorl %eax, %eax			; X64-SSE2-NEXT: sete %al
	; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938			; X64-SSE2-NEXT: retq
	; X64-NEXT: cmpq %rcx, 8(%rdi)			;
	; X64-NEXT: je .LBB20_3			; X64-AVX-LABEL: length16_eq_const:
	; X64-NEXT: .LBB20_1: # %res_block			; X64-AVX: # BB#0:
	; X64-NEXT: movl $1, %eax			; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
	; X64-NEXT: .LBB20_3: # %endblock			; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
	; X64-NEXT: testl %eax, %eax			; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
	; X64-NEXT: sete %al			; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: retq			; X64-AVX-NEXT: sete %al
				; X64-AVX-NEXT: retq
	%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind			%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
	%c = icmp eq i32 %m, 0			%c = icmp eq i32 %m, 0
	ret i1 %c			ret i1 %c
	}			}

	; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914			; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914

	define i32 @length24(i8* %X, i8* %Y) nounwind {			define i32 @length24(i8* %X, i8* %Y) nounwind {
	Show All 23 Lines
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NEXT: testl %eax, %eax
	; X86-NEXT: sete %al			; X86-NEXT: sete %al
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: length24_eq:			; X64-SSE2-LABEL: length24_eq:
	; X64: # BB#0:			; X64-SSE2: # BB#0: # %loadbb
	; X64-NEXT: pushq %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-NEXT: movl $24, %edx			; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
	; X64-NEXT: callq memcmp			; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
	; X64-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
	; X64-NEXT: sete %al			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: popq %rcx			; X64-SSE2-NEXT: jne .LBB22_1
	; X64-NEXT: retq			; X64-SSE2-NEXT: # BB#2: # %loadbb1
				; X64-SSE2-NEXT: movq 16(%rdi), %rcx
				; X64-SSE2-NEXT: xorl %eax, %eax
				; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
				; X64-SSE2-NEXT: je .LBB22_3
				; X64-SSE2-NEXT: .LBB22_1: # %res_block
				; X64-SSE2-NEXT: movl $1, %eax
				; X64-SSE2-NEXT: .LBB22_3: # %endblock
				; X64-SSE2-NEXT: testl %eax, %eax
				; X64-SSE2-NEXT: sete %al
				; X64-SSE2-NEXT: retq
				;
				; X64-AVX-LABEL: length24_eq:
				; X64-AVX: # BB#0: # %loadbb
				; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
				; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
				; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
				; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-AVX-NEXT: jne .LBB22_1
				; X64-AVX-NEXT: # BB#2: # %loadbb1
				; X64-AVX-NEXT: movq 16(%rdi), %rcx
				; X64-AVX-NEXT: xorl %eax, %eax
				; X64-AVX-NEXT: cmpq 16(%rsi), %rcx
				; X64-AVX-NEXT: je .LBB22_3
				; X64-AVX-NEXT: .LBB22_1: # %res_block
				; X64-AVX-NEXT: movl $1, %eax
				; X64-AVX-NEXT: .LBB22_3: # %endblock
				; X64-AVX-NEXT: testl %eax, %eax
				; X64-AVX-NEXT: sete %al
				; X64-AVX-NEXT: retq
	%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind			%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
	%cmp = icmp eq i32 %call, 0			%cmp = icmp eq i32 %call, 0
	ret i1 %cmp			ret i1 %cmp
	}			}

	define i1 @length24_eq_const(i8* %X) nounwind {			define i1 @length24_eq_const(i8* %X) nounwind {
	; X86-LABEL: length24_eq_const:			; X86-LABEL: length24_eq_const:
	; X86: # BB#0:			; X86: # BB#0:
	; X86-NEXT: pushl $0			; X86-NEXT: pushl $0
	; X86-NEXT: pushl $24			; X86-NEXT: pushl $24
	; X86-NEXT: pushl $.L.str			; X86-NEXT: pushl $.L.str
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NEXT: testl %eax, %eax
	; X86-NEXT: setne %al			; X86-NEXT: setne %al
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: length24_eq_const:			; X64-SSE2-LABEL: length24_eq_const:
	; X64: # BB#0:			; X64-SSE2: # BB#0: # %loadbb
	; X64-NEXT: pushq %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-NEXT: movl $.L.str, %esi			; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
	; X64-NEXT: movl $24, %edx			; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
	; X64-NEXT: callq memcmp			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: jne .LBB23_1
	; X64-NEXT: setne %al			; X64-SSE2-NEXT: # BB#2: # %loadbb1
	; X64-NEXT: popq %rcx			; X64-SSE2-NEXT: xorl %eax, %eax
	; X64-NEXT: retq			; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
				; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
				; X64-SSE2-NEXT: je .LBB23_3
				; X64-SSE2-NEXT: .LBB23_1: # %res_block
				; X64-SSE2-NEXT: movl $1, %eax
				; X64-SSE2-NEXT: .LBB23_3: # %endblock
				; X64-SSE2-NEXT: testl %eax, %eax
				; X64-SSE2-NEXT: setne %al
				; X64-SSE2-NEXT: retq
				;
				; X64-AVX-LABEL: length24_eq_const:
				; X64-AVX: # BB#0: # %loadbb
				; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
				; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
				; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
				; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-AVX-NEXT: jne .LBB23_1
				; X64-AVX-NEXT: # BB#2: # %loadbb1
				; X64-AVX-NEXT: xorl %eax, %eax
				; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
				; X64-AVX-NEXT: cmpq %rcx, 16(%rdi)
				; X64-AVX-NEXT: je .LBB23_3
				; X64-AVX-NEXT: .LBB23_1: # %res_block
				; X64-AVX-NEXT: movl $1, %eax
				; X64-AVX-NEXT: .LBB23_3: # %endblock
				; X64-AVX-NEXT: testl %eax, %eax
				; X64-AVX-NEXT: setne %al
				; X64-AVX-NEXT: retq
	%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind			%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
	%c = icmp ne i32 %m, 0			%c = icmp ne i32 %m, 0
	ret i1 %c			ret i1 %c
	}			}

	define i32 @length32(i8* %X, i8* %Y) nounwind {			define i32 @length32(i8* %X, i8* %Y) nounwind {
	; X86-LABEL: length32:			; X86-LABEL: length32:
	; X86: # BB#0:			; X86: # BB#0:
	Show All 11 Lines
	; X64-NEXT: jmp memcmp # TAILCALL			; X64-NEXT: jmp memcmp # TAILCALL
	%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind			%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind
	ret i32 %m			ret i32 %m
	}			}

	; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325			; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325

	define i1 @length32_eq(i8* %x, i8* %y) nounwind {			define i1 @length32_eq(i8* %x, i8* %y) nounwind {
	; X86-LABEL: length32_eq:			; X86-NOSSE-LABEL: length32_eq:
	; X86: # BB#0:			; X86-NOSSE: # BB#0:
	; X86-NEXT: pushl $0			; X86-NOSSE-NEXT: pushl $0
	; X86-NEXT: pushl $32			; X86-NOSSE-NEXT: pushl $32
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NOSSE-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NOSSE-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NOSSE-NEXT: testl %eax, %eax
	; X86-NEXT: sete %al			; X86-NOSSE-NEXT: sete %al
	; X86-NEXT: retl			; X86-NOSSE-NEXT: retl
				;
				; X86-SSE1-LABEL: length32_eq:
				; X86-SSE1: # BB#0:
				; X86-SSE1-NEXT: pushl $0
				; X86-SSE1-NEXT: pushl $32
				; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
				; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
				; X86-SSE1-NEXT: calll memcmp
				; X86-SSE1-NEXT: addl $16, %esp
				; X86-SSE1-NEXT: testl %eax, %eax
				; X86-SSE1-NEXT: sete %al
				; X86-SSE1-NEXT: retl
				;
				; X86-SSE2-LABEL: length32_eq:
				; X86-SSE2: # BB#0: # %loadbb
				; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
				; X86-SSE2-NEXT: movdqu (%eax), %xmm1
				; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
				; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
				; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
				; X86-SSE2-NEXT: jne .LBB25_1
				; X86-SSE2-NEXT: # BB#2: # %loadbb1
				; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
				; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
				; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
				; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx
				; X86-SSE2-NEXT: xorl %eax, %eax
				; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X86-SSE2-NEXT: je .LBB25_3
				; X86-SSE2-NEXT: .LBB25_1: # %res_block
				; X86-SSE2-NEXT: movl $1, %eax
				; X86-SSE2-NEXT: .LBB25_3: # %endblock
				; X86-SSE2-NEXT: testl %eax, %eax
				; X86-SSE2-NEXT: sete %al
				; X86-SSE2-NEXT: retl
	;			;
	; X64-SSE2-LABEL: length32_eq:			; X64-SSE2-LABEL: length32_eq:
	; X64-SSE2: # BB#0:			; X64-SSE2: # BB#0: # %loadbb
	; X64-SSE2-NEXT: pushq %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-SSE2-NEXT: movl $32, %edx			; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
	; X64-SSE2-NEXT: callq memcmp			; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
				; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
				; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-SSE2-NEXT: jne .LBB25_1
				; X64-SSE2-NEXT: # BB#2: # %loadbb1
				; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
				; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
				; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
				; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx
				; X64-SSE2-NEXT: xorl %eax, %eax
				; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X64-SSE2-NEXT: je .LBB25_3
				; X64-SSE2-NEXT: .LBB25_1: # %res_block
				; X64-SSE2-NEXT: movl $1, %eax
				; X64-SSE2-NEXT: .LBB25_3: # %endblock
	; X64-SSE2-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: testl %eax, %eax
	; X64-SSE2-NEXT: sete %al			; X64-SSE2-NEXT: sete %al
	; X64-SSE2-NEXT: popq %rcx
	; X64-SSE2-NEXT: retq			; X64-SSE2-NEXT: retq
	;			;
	; X64-AVX1-LABEL: length32_eq:			; X64-AVX1-LABEL: length32_eq:
	; X64-AVX1: # BB#0:			; X64-AVX1: # BB#0: # %loadbb
	; X64-AVX1-NEXT: movq 16(%rdi), %rax			; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
	; X64-AVX1-NEXT: movq (%rdi), %rcx			; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
	; X64-AVX1-NEXT: movq 8(%rdi), %rdx			; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
	; X64-AVX1-NEXT: movq 24(%rdi), %rdi			; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-AVX1-NEXT: xorq 24(%rsi), %rdi			; X64-AVX1-NEXT: jne .LBB25_1
	; X64-AVX1-NEXT: xorq 8(%rsi), %rdx			; X64-AVX1-NEXT: # BB#2: # %loadbb1
	; X64-AVX1-NEXT: orq %rdi, %rdx			; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
	; X64-AVX1-NEXT: xorq 16(%rsi), %rax			; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0
	; X64-AVX1-NEXT: xorq (%rsi), %rcx			; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
	; X64-AVX1-NEXT: orq %rax, %rcx			; X64-AVX1-NEXT: xorl %eax, %eax
	; X64-AVX1-NEXT: orq %rdx, %rcx			; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X64-AVX1-NEXT: je .LBB25_3
				; X64-AVX1-NEXT: .LBB25_1: # %res_block
				; X64-AVX1-NEXT: movl $1, %eax
				; X64-AVX1-NEXT: .LBB25_3: # %endblock
				; X64-AVX1-NEXT: testl %eax, %eax
	; X64-AVX1-NEXT: sete %al			; X64-AVX1-NEXT: sete %al
	; X64-AVX1-NEXT: retq			; X64-AVX1-NEXT: retq
	;			;
	; X64-AVX2-LABEL: length32_eq:			; X64-AVX2-LABEL: length32_eq:
	; X64-AVX2: # BB#0:			; X64-AVX2: # BB#0:
	; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0			; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
	; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0			; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
	; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax			; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
	; X64-AVX2-NEXT: cmpl $-1, %eax			; X64-AVX2-NEXT: cmpl $-1, %eax
	; X64-AVX2-NEXT: sete %al			; X64-AVX2-NEXT: sete %al
	; X64-AVX2-NEXT: vzeroupper			; X64-AVX2-NEXT: vzeroupper
	; X64-AVX2-NEXT: retq			; X64-AVX2-NEXT: retq
	%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind			%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
	%cmp = icmp eq i32 %call, 0			%cmp = icmp eq i32 %call, 0
	ret i1 %cmp			ret i1 %cmp
	}			}

	define i1 @length32_eq_const(i8* %X) nounwind {			define i1 @length32_eq_const(i8* %X) nounwind {
	; X86-LABEL: length32_eq_const:			; X86-NOSSE-LABEL: length32_eq_const:
	; X86: # BB#0:			; X86-NOSSE: # BB#0:
	; X86-NEXT: pushl $0			; X86-NOSSE-NEXT: pushl $0
	; X86-NEXT: pushl $32			; X86-NOSSE-NEXT: pushl $32
	; X86-NEXT: pushl $.L.str			; X86-NOSSE-NEXT: pushl $.L.str
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NOSSE-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NOSSE-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NOSSE-NEXT: testl %eax, %eax
	; X86-NEXT: setne %al			; X86-NOSSE-NEXT: setne %al
	; X86-NEXT: retl			; X86-NOSSE-NEXT: retl
				;
				; X86-SSE1-LABEL: length32_eq_const:
				; X86-SSE1: # BB#0:
				; X86-SSE1-NEXT: pushl $0
				; X86-SSE1-NEXT: pushl $32
				; X86-SSE1-NEXT: pushl $.L.str
				; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
				; X86-SSE1-NEXT: calll memcmp
				; X86-SSE1-NEXT: addl $16, %esp
				; X86-SSE1-NEXT: testl %eax, %eax
				; X86-SSE1-NEXT: setne %al
				; X86-SSE1-NEXT: retl
				;
				; X86-SSE2-LABEL: length32_eq_const:
				; X86-SSE2: # BB#0: # %loadbb
				; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-SSE2-NEXT: movdqu (%eax), %xmm0
				; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
				; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
				; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X86-SSE2-NEXT: jne .LBB26_1
				; X86-SSE2-NEXT: # BB#2: # %loadbb1
				; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
				; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
				; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
				; X86-SSE2-NEXT: xorl %eax, %eax
				; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X86-SSE2-NEXT: je .LBB26_3
				; X86-SSE2-NEXT: .LBB26_1: # %res_block
				; X86-SSE2-NEXT: movl $1, %eax
				; X86-SSE2-NEXT: .LBB26_3: # %endblock
				; X86-SSE2-NEXT: testl %eax, %eax
				; X86-SSE2-NEXT: setne %al
				; X86-SSE2-NEXT: retl
	;			;
	; X64-SSE2-LABEL: length32_eq_const:			; X64-SSE2-LABEL: length32_eq_const:
	; X64-SSE2: # BB#0:			; X64-SSE2: # BB#0: # %loadbb
	; X64-SSE2-NEXT: pushq %rax			; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
	; X64-SSE2-NEXT: movl $.L.str, %esi			; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
	; X64-SSE2-NEXT: movl $32, %edx			; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
	; X64-SSE2-NEXT: callq memcmp			; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
				; X64-SSE2-NEXT: jne .LBB26_1
				; X64-SSE2-NEXT: # BB#2: # %loadbb1
				; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
				; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
				; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
				; X64-SSE2-NEXT: xorl %eax, %eax
				; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X64-SSE2-NEXT: je .LBB26_3
				; X64-SSE2-NEXT: .LBB26_1: # %res_block
				; X64-SSE2-NEXT: movl $1, %eax
				; X64-SSE2-NEXT: .LBB26_3: # %endblock
	; X64-SSE2-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: testl %eax, %eax
	; X64-SSE2-NEXT: setne %al			; X64-SSE2-NEXT: setne %al
	; X64-SSE2-NEXT: popq %rcx
	; X64-SSE2-NEXT: retq			; X64-SSE2-NEXT: retq
	;			;
	; X64-AVX1-LABEL: length32_eq_const:			; X64-AVX1-LABEL: length32_eq_const:
	; X64-AVX1: # BB#0:			; X64-AVX1: # BB#0: # %loadbb
	; X64-AVX1-NEXT: movabsq $3544395820347831604, %rax # imm = 0x3130393837363534			; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
	; X64-AVX1-NEXT: xorq 24(%rdi), %rax			; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
	; X64-AVX1-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938			; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
	; X64-AVX1-NEXT: xorq 8(%rdi), %rcx			; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
	; X64-AVX1-NEXT: orq %rax, %rcx			; X64-AVX1-NEXT: jne .LBB26_1
	; X64-AVX1-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736			; X64-AVX1-NEXT: # BB#2: # %loadbb1
	; X64-AVX1-NEXT: xorq 16(%rdi), %rax			; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
	; X64-AVX1-NEXT: movabsq $3978425819141910832, %rdx # imm = 0x3736353433323130			; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
	; X64-AVX1-NEXT: xorq (%rdi), %rdx			; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
	; X64-AVX1-NEXT: orq %rax, %rdx			; X64-AVX1-NEXT: xorl %eax, %eax
	; X64-AVX1-NEXT: orq %rcx, %rdx			; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
				; X64-AVX1-NEXT: je .LBB26_3
				; X64-AVX1-NEXT: .LBB26_1: # %res_block
				; X64-AVX1-NEXT: movl $1, %eax
				; X64-AVX1-NEXT: .LBB26_3: # %endblock
				; X64-AVX1-NEXT: testl %eax, %eax
	; X64-AVX1-NEXT: setne %al			; X64-AVX1-NEXT: setne %al
	; X64-AVX1-NEXT: retq			; X64-AVX1-NEXT: retq
	;			;
	; X64-AVX2-LABEL: length32_eq_const:			; X64-AVX2-LABEL: length32_eq_const:
	; X64-AVX2: # BB#0:			; X64-AVX2: # BB#0:
	; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0			; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
	; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0			; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
	; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax			; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
	Show All 33 Lines
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NEXT: testl %eax, %eax
	; X86-NEXT: setne %al			; X86-NEXT: setne %al
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: length64_eq:			; X64-SSE2-LABEL: length64_eq:
	; X64: # BB#0:			; X64-SSE2: # BB#0:
	; X64-NEXT: pushq %rax			; X64-SSE2-NEXT: pushq %rax
	; X64-NEXT: movl $64, %edx			; X64-SSE2-NEXT: movl $64, %edx
	; X64-NEXT: callq memcmp			; X64-SSE2-NEXT: callq memcmp
	; X64-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: testl %eax, %eax
	; X64-NEXT: setne %al			; X64-SSE2-NEXT: setne %al
	; X64-NEXT: popq %rcx			; X64-SSE2-NEXT: popq %rcx
	; X64-NEXT: retq			; X64-SSE2-NEXT: retq
				;
				; X64-AVX1-LABEL: length64_eq:
				; X64-AVX1: # BB#0:
				; X64-AVX1-NEXT: pushq %rax
				; X64-AVX1-NEXT: movl $64, %edx
				; X64-AVX1-NEXT: callq memcmp
				; X64-AVX1-NEXT: testl %eax, %eax
				; X64-AVX1-NEXT: setne %al
				; X64-AVX1-NEXT: popq %rcx
				; X64-AVX1-NEXT: retq
				;
				; X64-AVX2-LABEL: length64_eq:
				; X64-AVX2: # BB#0: # %loadbb
				; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
				; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
				; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
				; X64-AVX2-NEXT: cmpl $-1, %eax
				; X64-AVX2-NEXT: jne .LBB28_1
				; X64-AVX2-NEXT: # BB#2: # %loadbb1
				; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
				; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
				; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
				; X64-AVX2-NEXT: xorl %eax, %eax
				; X64-AVX2-NEXT: cmpl $-1, %ecx
				; X64-AVX2-NEXT: je .LBB28_3
				; X64-AVX2-NEXT: .LBB28_1: # %res_block
				; X64-AVX2-NEXT: movl $1, %eax
				; X64-AVX2-NEXT: .LBB28_3: # %endblock
				; X64-AVX2-NEXT: testl %eax, %eax
				; X64-AVX2-NEXT: setne %al
				; X64-AVX2-NEXT: vzeroupper
				; X64-AVX2-NEXT: retq
	%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind			%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
	%cmp = icmp ne i32 %call, 0			%cmp = icmp ne i32 %call, 0
	ret i1 %cmp			ret i1 %cmp
	}			}

	define i1 @length64_eq_const(i8* %X) nounwind {			define i1 @length64_eq_const(i8* %X) nounwind {
	; X86-LABEL: length64_eq_const:			; X86-LABEL: length64_eq_const:
	; X86: # BB#0:			; X86: # BB#0:
	; X86-NEXT: pushl $0			; X86-NEXT: pushl $0
	; X86-NEXT: pushl $64			; X86-NEXT: pushl $64
	; X86-NEXT: pushl $.L.str			; X86-NEXT: pushl $.L.str
	; X86-NEXT: pushl {{[0-9]+}}(%esp)			; X86-NEXT: pushl {{[0-9]+}}(%esp)
	; X86-NEXT: calll memcmp			; X86-NEXT: calll memcmp
	; X86-NEXT: addl $16, %esp			; X86-NEXT: addl $16, %esp
	; X86-NEXT: testl %eax, %eax			; X86-NEXT: testl %eax, %eax
	; X86-NEXT: sete %al			; X86-NEXT: sete %al
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: length64_eq_const:			; X64-SSE2-LABEL: length64_eq_const:
	; X64: # BB#0:			; X64-SSE2: # BB#0:
	; X64-NEXT: pushq %rax			; X64-SSE2-NEXT: pushq %rax
	; X64-NEXT: movl $.L.str, %esi			; X64-SSE2-NEXT: movl $.L.str, %esi
	; X64-NEXT: movl $64, %edx			; X64-SSE2-NEXT: movl $64, %edx
	; X64-NEXT: callq memcmp			; X64-SSE2-NEXT: callq memcmp
	; X64-NEXT: testl %eax, %eax			; X64-SSE2-NEXT: testl %eax, %eax
	; X64-NEXT: sete %al			; X64-SSE2-NEXT: sete %al
	; X64-NEXT: popq %rcx			; X64-SSE2-NEXT: popq %rcx
	; X64-NEXT: retq			; X64-SSE2-NEXT: retq
				;
				; X64-AVX1-LABEL: length64_eq_const:
				; X64-AVX1: # BB#0:
				; X64-AVX1-NEXT: pushq %rax
				; X64-AVX1-NEXT: movl $.L.str, %esi
				; X64-AVX1-NEXT: movl $64, %edx
				; X64-AVX1-NEXT: callq memcmp
				; X64-AVX1-NEXT: testl %eax, %eax
				; X64-AVX1-NEXT: sete %al
				; X64-AVX1-NEXT: popq %rcx
				; X64-AVX1-NEXT: retq
				;
				; X64-AVX2-LABEL: length64_eq_const:
				; X64-AVX2: # BB#0: # %loadbb
				; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
				; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
				; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
				; X64-AVX2-NEXT: cmpl $-1, %eax
				; X64-AVX2-NEXT: jne .LBB29_1
				; X64-AVX2-NEXT: # BB#2: # %loadbb1
				; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
				; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
				; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
				; X64-AVX2-NEXT: xorl %eax, %eax
				; X64-AVX2-NEXT: cmpl $-1, %ecx
				; X64-AVX2-NEXT: je .LBB29_3
				; X64-AVX2-NEXT: .LBB29_1: # %res_block
				; X64-AVX2-NEXT: movl $1, %eax
				; X64-AVX2-NEXT: .LBB29_3: # %endblock
				; X64-AVX2-NEXT: testl %eax, %eax
				; X64-AVX2-NEXT: sete %al
				; X64-AVX2-NEXT: vzeroupper
				; X64-AVX2-NEXT: retq
	%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind			%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
	%c = icmp eq i32 %m, 0			%c = icmp eq i32 %m, 0
	ret i1 %c			ret i1 %c
	}			}

	; This checks that we do not do stupid things with huge sizes.			; This checks that we do not do stupid things with huge sizes.
	define i32 @huge_length(i8* %X, i8* %Y) nounwind {			define i32 @huge_length(i8* %X, i8* %Y) nounwind {
	; X86-LABEL: huge_length:			; X86-LABEL: huge_length:
	Show All 18 Lines

test/Transforms/CodeGenPrepare/X86/memcmp.ll

	Show First 20 Lines • Show All 747 Lines • ▼ Show 20 Lines
	define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) {			define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
	; X32-LABEL: @cmp_eq16(			; X32-LABEL: @cmp_eq16(
	; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16)			; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16)
	; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0			; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
	; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32			; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
	; X32-NEXT: ret i32 [[CONV]]			; X32-NEXT: ret i32 [[CONV]]
	;			;
	; X64-LABEL: @cmp_eq16(			; X64-LABEL: @cmp_eq16(
	; X64-NEXT: loadbb:			; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128*
	; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*			; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128*
	; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*			; X64-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]]
	; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]			; X64-NEXT: [[TMP4:%.*]] = load i128, i128* [[TMP2]]
	; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]			; X64-NEXT: [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]]
	; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]			; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
	; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]			; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
	; X64: res_block:
	; X64-NEXT: br label [[ENDBLOCK:%.*]]
	; X64: loadbb1:
	; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i64*
	; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i64*
	; X64-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 1
	; X64-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 1
	; X64-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP7]]
	; X64-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
	; X64-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP9]], [[TMP10]]
	; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
	; X64: endblock:
	; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
	; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
	; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32			; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
	; X64-NEXT: ret i32 [[CONV]]			; X64-NEXT: ret i32 [[CONV]]
	;			;
	%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)			%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
	%cmp = icmp eq i32 %call, 0			%cmp = icmp eq i32 %call, 0
	%conv = zext i1 %cmp to i32			%conv = zext i1 %cmp to i32
	ret i32 %conv			ret i32 %conv
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[CodeGen][ExpandMemcmp] Allow memcmp to expand to vector loads (2).
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 120814

include/llvm/Analysis/TargetTransformInfo.h

include/llvm/Analysis/TargetTransformInfoImpl.h

lib/Analysis/TargetTransformInfo.cpp

lib/CodeGen/CodeGenPrepare.cpp

lib/Target/PowerPC/PPCTargetTransformInfo.h

lib/Target/PowerPC/PPCTargetTransformInfo.cpp

lib/Target/X86/X86TargetTransformInfo.h

lib/Target/X86/X86TargetTransformInfo.cpp

lib/Transforms/Scalar/MergeICmps.cpp

test/CodeGen/X86/memcmp-optsize.ll

test/CodeGen/X86/memcmp.ll

test/Transforms/CodeGenPrepare/X86/memcmp.ll

This is an archive of the discontinued LLVM Phabricator instance.

[CodeGen][ExpandMemcmp] Allow memcmp to expand to vector loads (2).ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 120814

include/llvm/Analysis/TargetTransformInfo.h

include/llvm/Analysis/TargetTransformInfoImpl.h

lib/Analysis/TargetTransformInfo.cpp

lib/CodeGen/CodeGenPrepare.cpp

lib/Target/PowerPC/PPCTargetTransformInfo.h

lib/Target/PowerPC/PPCTargetTransformInfo.cpp

lib/Target/X86/X86TargetTransformInfo.h

lib/Target/X86/X86TargetTransformInfo.cpp

lib/Transforms/Scalar/MergeICmps.cpp

test/CodeGen/X86/memcmp-optsize.ll

test/CodeGen/X86/memcmp.ll

test/Transforms/CodeGenPrepare/X86/memcmp.ll

[CodeGen][ExpandMemcmp] Allow memcmp to expand to vector loads (2).
ClosedPublic