This is an archive of the discontinued LLVM Phabricator instance.

[SystemZ::TTI] Improve costs for add, sub and mul i16 against memory
ClosedPublic

Authored by jonpa on Nov 27 2018, 2:27 AM.

Download Raw Diff

Details

Reviewers

Summary

AH, SH and MH costs are already covered in the cases where LHS is 32 bits and RHS is 16 bits of memory sign-extended to i32.

As these instructions are also used when LHS is i16, this patch handles that case as well, by recognizing that the loads get folded there too.

This is NFC on SPEC, but it silently changes the scalar loop cost estimates (in LoopVectorizer) in 26 cases.

I'm not 100% sure about the implications of LHS being just 16 bits, but this seems to at least match what CodeGen is doing.

Diff Detail

Event Timeline

jonpa created this revision.Nov 27 2018, 2:27 AM

LGTM, thanks!

This revision is now accepted and ready to land.Nov 27 2018, 11:18 AM

r347734

Revision Contents

Path

Size

lib/

Target/

SystemZ/

SystemZTargetTransformInfo.cpp

10 lines

test/

Analysis/

CostModel/

SystemZ/

memop-folding-int-arith.ll

93 lines

Diff 175433

lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Show First 20 Lines • Show All 893 Lines • ▼ Show 20 Lines	if (TruncBits \|\| SExtBits \|\| ZExtBits) {
UserI = cast<Instruction>(*UserI->user_begin());		UserI = cast<Instruction>(*UserI->user_begin());
// Load (single use) -> trunc/extend (single use) -> UserI		// Load (single use) -> trunc/extend (single use) -> UserI
}		}
if ((UserI->getOpcode() == Instruction::Sub \|\|		if ((UserI->getOpcode() == Instruction::Sub \|\|
UserI->getOpcode() == Instruction::SDiv \|\|		UserI->getOpcode() == Instruction::SDiv \|\|
UserI->getOpcode() == Instruction::UDiv) &&		UserI->getOpcode() == Instruction::UDiv) &&
UserI->getOperand(1) != FoldedValue)		UserI->getOperand(1) != FoldedValue)
return false; // Not commutative, only RHS foldable.		return false; // Not commutative, only RHS foldable.
		// LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
		// extension was made of the load.
		unsigned LoadOrTruncBits =
		((SExtBits \|\| ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
switch (UserI->getOpcode()) {		switch (UserI->getOpcode()) {
case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64		case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
case Instruction::Sub:		case Instruction::Sub:
if (LoadedBits == 32 && ZExtBits == 64)		if (LoadedBits == 32 && ZExtBits == 64)
return true;		return true;
LLVM_FALLTHROUGH;		LLVM_FALLTHROUGH;
case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64		case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
if (LoadedBits == 16 &&		if (LoadedBits == 16 &&
(SExtBits == 32 \|\|		(SExtBits == 32 \|\|
(SExtBits == 64 && ST->hasMiscellaneousExtensions2())))		(SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
return true;		return true;
		if (LoadOrTruncBits == 16)
		return true;
LLVM_FALLTHROUGH;		LLVM_FALLTHROUGH;
case Instruction::SDiv:// SE: 32->64		case Instruction::SDiv:// SE: 32->64
if (LoadedBits == 32 && SExtBits == 64)		if (LoadedBits == 32 && SExtBits == 64)
return true;		return true;
LLVM_FALLTHROUGH;		LLVM_FALLTHROUGH;
case Instruction::UDiv:		case Instruction::UDiv:
case Instruction::And:		case Instruction::And:
case Instruction::Or:		case Instruction::Or:
case Instruction::Xor:		case Instruction::Xor:
case Instruction::ICmp:		case Instruction::ICmp:
// This also makes sense for float operations, but disabled for now due		// This also makes sense for float operations, but disabled for now due
// to regressions.		// to regressions.
// case Instruction::FCmp:		// case Instruction::FCmp:
// case Instruction::FAdd:		// case Instruction::FAdd:
// case Instruction::FSub:		// case Instruction::FSub:
// case Instruction::FMul:		// case Instruction::FMul:
// case Instruction::FDiv:		// case Instruction::FDiv:

// All possible extensions of memory checked above.		// All possible extensions of memory checked above.
if (SExtBits \|\| ZExtBits)
return false;

unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits);
return (LoadOrTruncBits == 32 \|\| LoadOrTruncBits == 64);		return (LoadOrTruncBits == 32 \|\| LoadOrTruncBits == 64);
break;		break;
}		}
return false;		return false;
}		}

int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,		int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment, unsigned AddressSpace,		unsigned Alignment, unsigned AddressSpace,
▲ Show 20 Lines • Show All 132 Lines • Show Last 20 Lines

test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll

	Show First 20 Lines • Show All 79 Lines • ▼ Show 20 Lines
	; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %zext_0 = zext i32 %li32_3 to i64			; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %zext_0 = zext i32 %li32_3 to i64
	; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %9 = add i64 %zext_0, undef			; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %9 = add i64 %zext_0, undef
	; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li16_3 = load i16, i16* undef			; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li16_3 = load i16, i16* undef
	; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_3 = sext i16 %li16_3 to i32			; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_3 = sext i16 %li16_3 to i32
	; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_4 = sext i16 %li16_3 to i32			; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_4 = sext i16 %li16_3 to i32
	; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %10 = add i32 %sext_3, undef			; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %10 = add i32 %sext_3, undef
	}			}

				define void @add_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) {
				%L1 = load i16, i16* %Src1
				%S0 = add i16 %L1, %Arg
				store volatile i16 %S0, i16* %Dst

				%L2 = load i16, i16* %Src1
				%L3 = load i16, i16* %Src2
				%S1 = add i16 %L2, %L3
				store volatile i16 %S1, i16* %Dst

				; Truncated load
				%L32 = load i32, i32* %Src32
				%tr = trunc i32 %L32 to i16
				%S2 = add i16 %tr, %Arg
				store volatile i16 %S2, i16* %Dst

				ret void
				; CHECK: Printing analysis 'Cost Model Analysis' for function 'add_i16_mem16':
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L1 = load i16, i16* %Src1
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %S0 = add i16 %L1, %Arg
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %S0, i16* %Dst
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L2 = load i16, i16* %Src1
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %L3 = load i16, i16* %Src2
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %S1 = add i16 %L2, %L3
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %S1, i16* %Dst
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L32 = load i32, i32* %Src32
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr = trunc i32 %L32 to i16
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %S2 = add i16 %tr, %Arg
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %S2, i16* %Dst
				}

	define void @sub_lhs_mem() {			define void @sub_lhs_mem() {
	%li32 = load i32, i32* undef			%li32 = load i32, i32* undef
	sub i32 %li32, undef			sub i32 %li32, undef

	%li32_0 = load i32, i32* undef			%li32_0 = load i32, i32* undef
	%li32_1 = load i32, i32* undef			%li32_1 = load i32, i32* undef
	sub i32 %li32_0, %li32_1			sub i32 %li32_0, %li32_1

	▲ Show 20 Lines • Show All 127 Lines • ▼ Show 20 Lines
	; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %zext_0 = zext i32 %li32_3 to i64			; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %zext_0 = zext i32 %li32_3 to i64
	; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %7 = sub i64 undef, %zext_0			; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %7 = sub i64 undef, %zext_0
	; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li16_3 = load i16, i16* undef			; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li16_3 = load i16, i16* undef
	; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_3 = sext i16 %li16_3 to i32			; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_3 = sext i16 %li16_3 to i32
	; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_4 = sext i16 %li16_3 to i32			; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_4 = sext i16 %li16_3 to i32
	; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %8 = sub i32 undef, %sext_3			; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %8 = sub i32 undef, %sext_3
	}			}

				define void @sub_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) {
				%L1 = load i16, i16* %Src1
				%D0 = sub i16 %Arg, %L1
				store volatile i16 %D0, i16* %Dst

				%L2 = load i16, i16* %Src1
				%L3 = load i16, i16* %Src2
				%D1 = sub i16 %L2, %L3
				store volatile i16 %D1, i16* %Dst

				; Truncated load
				%L32 = load i32, i32* %Src32
				%tr = trunc i32 %L32 to i16
				%D2 = sub i16 %Arg, %tr
				store volatile i16 %D2, i16* %Dst

				ret void
				; CHECK: Printing analysis 'Cost Model Analysis' for function 'sub_i16_mem16':
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L1 = load i16, i16* %Src1
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %D0 = sub i16 %Arg, %L1
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %D0, i16* %Dst
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %L2 = load i16, i16* %Src1
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L3 = load i16, i16* %Src2
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %D1 = sub i16 %L2, %L3
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %D1, i16* %Dst
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L32 = load i32, i32* %Src32
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr = trunc i32 %L32 to i16
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %D2 = sub i16 %Arg, %tr
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %D2, i16* %Dst
				}

	define void @mul() {			define void @mul() {
	%li32 = load i32, i32* undef			%li32 = load i32, i32* undef
	mul i32 %li32, undef			mul i32 %li32, undef

	%li32_0 = load i32, i32* undef			%li32_0 = load i32, i32* undef
	%li32_1 = load i32, i32* undef			%li32_1 = load i32, i32* undef
	mul i32 %li32_0, %li32_1			mul i32 %li32_0, %li32_1

	▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines
	; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %zext_0 = zext i16 %li16_2 to i32			; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %zext_0 = zext i16 %li16_2 to i32
	; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %9 = mul i32 %zext_0, undef			; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %9 = mul i32 %zext_0, undef
	; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li16_3 = load i16, i16* undef			; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li16_3 = load i16, i16* undef
	; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_3 = sext i16 %li16_3 to i32			; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_3 = sext i16 %li16_3 to i32
	; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_4 = sext i16 %li16_3 to i32			; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_4 = sext i16 %li16_3 to i32
	; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %10 = mul i32 %sext_3, undef			; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %10 = mul i32 %sext_3, undef
	}			}

				define void @mul_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) {
				%L1 = load i16, i16* %Src1
				%P0 = mul i16 %Arg, %L1
				store volatile i16 %P0, i16* %Dst

				%L2 = load i16, i16* %Src1
				%L3 = load i16, i16* %Src2
				%P1 = mul i16 %L2, %L3
				store volatile i16 %P1, i16* %Dst

				; Truncated load
				%L32 = load i32, i32* %Src32
				%tr = trunc i32 %L32 to i16
				%P2 = mul i16 %Arg, %tr
				store volatile i16 %P2, i16* %Dst

				ret void
				; CHECK: Printing analysis 'Cost Model Analysis' for function 'mul_i16_mem16':
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L1 = load i16, i16* %Src1
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %P0 = mul i16 %Arg, %L1
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %P0, i16* %Dst
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L2 = load i16, i16* %Src1
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %L3 = load i16, i16* %Src2
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %P1 = mul i16 %L2, %L3
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %P1, i16* %Dst
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L32 = load i32, i32* %Src32
				; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr = trunc i32 %L32 to i16
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %P2 = mul i16 %Arg, %tr
				; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %P2, i16* %Dst
				}

	define void @sdiv_lhs(i32 %arg32, i64 %arg64) {			define void @sdiv_lhs(i32 %arg32, i64 %arg64) {
	%li32 = load i32, i32* undef			%li32 = load i32, i32* undef
	sdiv i32 %li32, %arg32			sdiv i32 %li32, %arg32

	%li32_0 = load i32, i32* undef			%li32_0 = load i32, i32* undef
	%li32_1 = load i32, i32* undef			%li32_1 = load i32, i32* undef
	sdiv i32 %li32_0, %li32_1			sdiv i32 %li32_0, %li32_1

	▲ Show 20 Lines • Show All 343 Lines • Show Last 20 Lines