This is an archive of the discontinued LLVM Phabricator instance.

[SystemZ::TTI] Return zero cost for a load / store connected with a scalar bswap
ClosedPublic

Authored by jonpa on Nov 24 2018, 7:52 AM.

Download Raw Diff

Details

Reviewers

Summary

Since byte-swapping loads and stores are supported, a loop containing a load -> bswap or bswap -> store should have the cost reduced by 1 for each such pair.

Since the Instruction pointer is available in getMemoryOpCost(), the search that detects these cases is done there. Arguably the 0 cost should have been assigned to the bswap intrinsic instead, but it is not possible to handle both cases in getIntrinsicInstrCost(), as only the arguments are available there.

This is NFC on SPEC while ~20 loops get their scalar costs corrected without affecting any vectorizer decisions.

Diff Detail

Event Timeline

jonpa created this revision. Nov 24 2018, 7:52 AM

LGTM, thanks!

This revision is now accepted and ready to land. Nov 27 2018, 11:13 AM

r347732

Revision Contents

Path

Size

lib/

Target/

SystemZ/

SystemZTargetTransformInfo.cpp

25 lines

test/

Analysis/

CostModel/

SystemZ/

intrinsics.ll

67 lines

Diff 175160

lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Show First 20 Lines • Show All 933 Lines • ▼ Show 20 Lines	case Instruction::ICmp:

unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits);		unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits);
return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);		return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
break;		break;
}		}
return false;		return false;
}		}

		/// Return true if \p V is a call whose called function is the bswap
		/// intrinsic. Values that are not call instructions, and calls for which
		/// getCalledFunction() returns null, yield false.
		static bool isBswapIntrinsicCall(const Value *V) {
		  const auto *Call = dyn_cast<CallInst>(V);
		  if (!Call)
		    return false;
		  const auto *Callee = Call->getCalledFunction();
		  return Callee && Callee->getIntrinsicID() == Intrinsic::bswap;
		}

int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,		int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment, unsigned AddressSpace,		unsigned Alignment, unsigned AddressSpace,
const Instruction *I) {		const Instruction *I) {
assert(!Src->isVoidTy() && "Invalid type");		assert(!Src->isVoidTy() && "Invalid type");

if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {		if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
// Store the load or its truncated or extended value in FoldedValue.		// Store the load or its truncated or extended value in FoldedValue.
const Instruction *FoldedValue = nullptr;		const Instruction *FoldedValue = nullptr;
Show All 20 Lines	if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {

return 0; // Only I is foldable in user.		return 0; // Only I is foldable in user.
}		}
}		}

unsigned NumOps =		unsigned NumOps =
(Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));		(Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

		// Store/Load reversed saves one instruction.
		if (!Src->isVectorTy() && NumOps == 1 && I != nullptr) {
		if (Opcode == Instruction::Load && I->hasOneUse()) {
		const Instruction *LdUser = cast<Instruction>(*I->user_begin());
		// In case of load -> bswap -> store, return normal cost for the load.
		if (isBswapIntrinsicCall(LdUser) &&
		(!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
		return 0;
		}
		else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
		const Value *StoredVal = SI->getValueOperand();
		if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
		return 0;
		}
		}

if (Src->getScalarSizeInBits() == 128)		if (Src->getScalarSizeInBits() == 128)
// 128 bit scalars are held in a pair of two 64 bit registers.		// 128 bit scalars are held in a pair of two 64 bit registers.
NumOps *= 2;		NumOps *= 2;

return NumOps;		return NumOps;
}		}

// The generic implementation of getInterleavedMemoryOpCost() is based on		// The generic implementation of getInterleavedMemoryOpCost() is based on
▲ Show 20 Lines • Show All 90 Lines • Show Last 20 Lines

test/Analysis/CostModel/SystemZ/intrinsics.ll

Show All 34 Lines	; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %swp16 = tail call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef)
%swp1 = tail call i16 @llvm.bswap.i16(i16 %arg)		%swp1 = tail call i16 @llvm.bswap.i16(i16 %arg)
%swp2 = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %arg2)		%swp2 = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %arg2)
%swp4 = tail call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %arg4)		%swp4 = tail call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %arg4)
%swp8 = tail call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %arg8)		%swp8 = tail call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %arg8)
%swp16 = tail call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef)		%swp16 = tail call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef)
ret void		ret void
}		}

		; Test that store/load reversed is reflected in costs.
		; Three i64 patterns are exercised, with the expected costs listed below:
		;   1. load -> bswap:          the load is expected to cost 0.
		;   2. bswap -> store:         the store is expected to cost 0.
		;   3. load -> bswap -> store: the load keeps its normal cost of 1 and
		;                              only the store is expected to cost 0.
		define void @bswap_i64_mem(i64* %src, i64 %arg, i64* %dst) {
		; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i64_mem':
		; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld1 = load i64, i64* %src
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i64 @llvm.bswap.i64(i64 %Ld1)
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call i64 @llvm.bswap.i64(i64 %arg)
		; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i64 %swp2, i64* %dst
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld2 = load i64, i64* %src
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp3 = tail call i64 @llvm.bswap.i64(i64 %Ld2)
		; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i64 %swp3, i64* %dst
		; Pattern 1: the loaded value is only byte-swapped; the result is not stored.
		%Ld1 = load i64, i64* %src
		%swp1 = tail call i64 @llvm.bswap.i64(i64 %Ld1)

		; Pattern 2: a byte-swapped argument is stored.
		%swp2 = tail call i64 @llvm.bswap.i64(i64 %arg)
		store i64 %swp2, i64* %dst

		; Pattern 3: the byte-swapped load is itself stored.
		%Ld2 = load i64, i64* %src
		%swp3 = tail call i64 @llvm.bswap.i64(i64 %Ld2)
		store i64 %swp3, i64* %dst

		ret void
		}

		; i32 variant of the load/store + bswap cost test. Expected costs:
		;   1. load -> bswap:          the load is expected to cost 0.
		;   2. bswap -> store:         the store is expected to cost 0.
		;   3. load -> bswap -> store: the load keeps its normal cost of 1 and
		;                              only the store is expected to cost 0.
		define void @bswap_i32_mem(i32* %src, i32 %arg, i32* %dst) {
		; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i32_mem':
		; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld1 = load i32, i32* %src
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i32 @llvm.bswap.i32(i32 %Ld1)
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call i32 @llvm.bswap.i32(i32 %arg)
		; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i32 %swp2, i32* %dst
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld2 = load i32, i32* %src
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp3 = tail call i32 @llvm.bswap.i32(i32 %Ld2)
		; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i32 %swp3, i32* %dst
		; Pattern 1: the loaded value is only byte-swapped; the result is not stored.
		%Ld1 = load i32, i32* %src
		%swp1 = tail call i32 @llvm.bswap.i32(i32 %Ld1)

		; Pattern 2: a byte-swapped argument is stored.
		%swp2 = tail call i32 @llvm.bswap.i32(i32 %arg)
		store i32 %swp2, i32* %dst

		; Pattern 3: the byte-swapped load is itself stored.
		%Ld2 = load i32, i32* %src
		%swp3 = tail call i32 @llvm.bswap.i32(i32 %Ld2)
		store i32 %swp3, i32* %dst

		ret void
		}

		; i16 variant of the load/store + bswap cost test. Expected costs:
		;   1. load -> bswap:          the load is expected to cost 0.
		;   2. bswap -> store:         the store is expected to cost 0.
		;   3. load -> bswap -> store: the load keeps its normal cost of 1 and
		;                              only the store is expected to cost 0.
		define void @bswap_i16_mem(i16* %src, i16 %arg, i16* %dst) {
		; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i16_mem':
		; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld1 = load i16, i16* %src
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i16 @llvm.bswap.i16(i16 %Ld1)
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call i16 @llvm.bswap.i16(i16 %arg)
		; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i16 %swp2, i16* %dst
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld2 = load i16, i16* %src
		; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp3 = tail call i16 @llvm.bswap.i16(i16 %Ld2)
		; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i16 %swp3, i16* %dst
		; Pattern 1: the loaded value is only byte-swapped; the result is not stored.
		%Ld1 = load i16, i16* %src
		%swp1 = tail call i16 @llvm.bswap.i16(i16 %Ld1)

		; Pattern 2: a byte-swapped argument is stored.
		%swp2 = tail call i16 @llvm.bswap.i16(i16 %arg)
		store i16 %swp2, i16* %dst

		; Pattern 3: the byte-swapped load is itself stored.
		%Ld2 = load i16, i16* %src
		%swp3 = tail call i16 @llvm.bswap.i16(i16 %Ld2)
		store i16 %swp3, i16* %dst

		ret void
		}


declare i64 @llvm.bswap.i64(i64)		declare i64 @llvm.bswap.i64(i64)
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)		declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)		declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)

declare i32 @llvm.bswap.i32(i32)		declare i32 @llvm.bswap.i32(i32)
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)		declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)		declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)		declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)

declare i16 @llvm.bswap.i16(i16)		declare i16 @llvm.bswap.i16(i16)
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)		declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)		declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)		declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)		declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)