Diff 398573

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//		//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
		Lint: Lint Inline Actions clang-format not found in user’s local PATH; not linting file. Lint: Lint: clang-format not found in user’s local PATH; not linting file.
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
▲ Show 20 Lines • Show All 8,934 Lines • ▼ Show 20 Lines	if (N1C && !N1C->isOpaque())
if (SDValue NewSRA = visitShiftByConstant(N))		if (SDValue NewSRA = visitShiftByConstant(N))
return NewSRA;		return NewSRA;

// Try to transform this shift into a multiply-high if		// Try to transform this shift into a multiply-high if
// it matches the appropriate pattern detected in combineShiftToMULH.		// it matches the appropriate pattern detected in combineShiftToMULH.
if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))		if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
return MULH;		return MULH;

		// Attempt to convert a sra of a load into a narrower sign-extending load.
		if (SDValue NarrowLoad = ReduceLoadWidth(N))
		return NarrowLoad;

return SDValue();		return SDValue();
}		}

SDValue DAGCombiner::visitSRL(SDNode *N) {		SDValue DAGCombiner::visitSRL(SDNode *N) {
SDValue N0 = N->getOperand(0);		SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);		SDValue N1 = N->getOperand(1);
if (SDValue V = DAG.simplifyShift(N0, N1))		if (SDValue V = DAG.simplifyShift(N0, N1))
return V;		return V;
▲ Show 20 Lines • Show All 3,156 Lines • ▼ Show 20 Lines	SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {

unsigned ShAmt = 0;		unsigned ShAmt = 0;
bool HasShiftedOffset = false;		bool HasShiftedOffset = false;
// Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then		// Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
// extended to VT.		// extended to VT.
if (Opc == ISD::SIGN_EXTEND_INREG) {		if (Opc == ISD::SIGN_EXTEND_INREG) {
ExtType = ISD::SEXTLOAD;		ExtType = ISD::SEXTLOAD;
ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();		ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
		} else if (Opc == ISD::SRA) {
		// Another special-case: SRA is basically sign-extending a narrower value,
		// or it may be shifting a higher subword, half or byte into the lowest
		// bits.
		SDValue N1 = N->getOperand(1);
		// Only handle shift with constant shift amount, and the shiftee must be a
		// non-zextload load.
		auto *LN0 = dyn_cast<LoadSDNode>(N0);
		auto *N1C = dyn_cast<ConstantSDNode>(N1);
		if (!N1C \|\| !LN0)
		return SDValue();
		if (LN0->getExtensionType() == ISD::ZEXTLOAD)
		return SDValue();
		// If the shift amount is larger than the memory type then we're not
		// accessing any of the loaded bytes.
		ShAmt = N1C->getZExtValue();
		spatelUnsubmitted Not Done Reply Inline Actions The various shift amount variables are difficult to follow. This is really a question for the existing code - can we make it clearer (either through renaming or code comments) how "ShAmt", "ShiftAmt" and "ShLeftAmt" are different? If we can improve that, is it possible to make a small helper/lambda, so we can share the bailout conditions for SRA/SRL instead of duplicating code? spatel: The various shift amount variables are difficult to follow. This is really a question for the…
		bjopeAuthorUnsubmitted Done Reply Inline Actions Yes. I thought the old names were kind of confusing. So I added the SRA separately. I'll make a separate patch, trying to improve the existing code as well. One reason I did not share things with SRL (I started off trying to do it that way) is that the solution for SRL seemed to be divided into two parts. I believe the second part (that includes the hasOneUse guard) can also be triggered when the SRL is nested inside an AND (N pointing at the AND and N0 being the SRL). Although we can probably do some code sharing for the first part regardless of that, at least now that I understand that the unexplained "N0 = SDValue(N, 0)" should be done only for SRL and not for SRA. bjope: Yes. I thought the old names were kind of confusing. So I added the SRA separately. I'll make a…
		uint64_t MemoryWidth = LN0->getMemoryVT().getScalarSizeInBits();
		if (MemoryWidth <= ShAmt)
		return SDValue();
		// Attempt to fold away the SRA by using SEXTLOAD.
		ExtType = ISD::SEXTLOAD;
		ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
} else if (Opc == ISD::SRL) {		} else if (Opc == ISD::SRL) {
// Another special-case: SRL is basically zero-extending a narrower value,		// Another special-case: SRL is basically zero-extending a narrower value,
// or it may be shifting a higher subword, half or byte into the lowest		// or it may be shifting a higher subword, half or byte into the lowest
// bits.		// bits.
ExtType = ISD::ZEXTLOAD;		ExtType = ISD::ZEXTLOAD;
N0 = SDValue(N, 0);		N0 = SDValue(N, 0);

auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));		auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
Show All 25 Lines	if (Mask.isMask()) {
HasShiftedOffset = true;		HasShiftedOffset = true;
} else		} else
return SDValue();		return SDValue();

ExtType = ISD::ZEXTLOAD;		ExtType = ISD::ZEXTLOAD;
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);		ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
}		}

		// FIXME: Investigate/describe why we limit this to hasOneUse (it seems a bit
		// limiting for the case when N==N0, i.e. when being called from visitSRL).
if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {		if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
SDValue SRL = N0;		SDValue SRL = N0;
if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {		if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
ShAmt = ConstShift->getZExtValue();		ShAmt = ConstShift->getZExtValue();
unsigned EVTBits = ExtVT.getScalarSizeInBits();		unsigned EVTBits = ExtVT.getScalarSizeInBits();
// Is the shift amount a multiple of size of VT?		// Is the shift amount a multiple of size of VT?
if ((ShAmt & (EVTBits-1)) == 0) {		if ((ShAmt & (EVTBits-1)) == 0) {
N0 = N0.getOperand(0);		N0 = N0.getOperand(0);
// Is the load width a multiple of size of VT?		// Is the load width a multiple of size of VT?
if ((N0.getScalarValueSizeInBits() & (EVTBits - 1)) != 0)		if ((N0.getScalarValueSizeInBits() & (EVTBits - 1)) != 0)
return SDValue();		return SDValue();
}		}

// At this point, we must have a load or else we can't do the transform.		// At this point, we must have a load or else we can't do the transform.
auto *LN0 = dyn_cast<LoadSDNode>(N0);		auto *LN0 = dyn_cast<LoadSDNode>(N0);
if (!LN0) return SDValue();		if (!LN0) return SDValue();

// Because a SRL must be assumed to need to zero-extend the high bits		// Because a SRL must be assumed to need to zero-extend the high bits
// (as opposed to anyext the high bits), we can't combine the zextload		// (as opposed to anyext the high bits), we can't combine the zextload
// lowering of SRL and an sextload.		// lowering of SRL and an sextload. Similarly for SRA
if (LN0->getExtensionType() == ISD::SEXTLOAD)		if (LN0->getExtensionType() == ISD::SEXTLOAD)
return SDValue();		return SDValue();

// If the shift amount is larger than the input type then we're not		// If the shift amount is larger than the input type then we're not
// accessing any of the loaded bytes. If the load was a zextload/extload		// accessing any of the loaded bytes. If the load was a zextload/extload
// then the result of the shift+trunc is zero/undef (handled elsewhere).		// then the result of the shift+trunc is zero/undef (handled elsewhere).
if (ShAmt >= LN0->getMemoryVT().getSizeInBits())		if (ShAmt >= LN0->getMemoryVT().getSizeInBits())
return SDValue();		return SDValue();
▲ Show 20 Lines • Show All 11,838 Lines • Show Last 20 Lines

llvm/test/CodeGen/PowerPC/pr13891.ll

	; RUN: llc -verify-machineinstrs < %s \| FileCheck %s			; RUN: llc -verify-machineinstrs < %s \| FileCheck %s
	target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"			target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
	target triple = "powerpc64-unknown-linux-gnu"			target triple = "powerpc64-unknown-linux-gnu"

	%struct.foo = type { i8, i8 }			%struct.foo = type { i8, i8 }

	define void @_Z5check3foos(%struct.foo* nocapture byval(%struct.foo) %f, i16 signext %i) noinline {			define void @_Z5check3foos(%struct.foo* nocapture byval(%struct.foo) %f, i16 signext %i) noinline {
	; CHECK-LABEL: _Z5check3foos:			; CHECK-LABEL: _Z5check3foos:
	; CHECK: sth 3, {{[0-9]+}}(1)			; CHECK: sth 3, {{[0-9]+}}(1)
	; CHECK: lha {{[0-9]+}}, {{[0-9]+}}(1)			; CHECK: lbz {{[0-9]+}}, {{[0-9]+}}(1)
	entry:			entry:
	%0 = bitcast %struct.foo* %f to i16*			%0 = bitcast %struct.foo* %f to i16*
	%1 = load i16, i16* %0, align 2			%1 = load i16, i16* %0, align 2
	%bf.val.sext = ashr i16 %1, 8			%bf.val.sext = ashr i16 %1, 8
	%cmp = icmp eq i16 %bf.val.sext, %i			%cmp = icmp eq i16 %bf.val.sext, %i
	br i1 %cmp, label %if.end, label %if.then			br i1 %cmp, label %if.end, label %if.then

	if.then: ; preds = %entry			if.then: ; preds = %entry
	Show All 9 Lines

llvm/test/CodeGen/X86/combine-sra-load.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown \| FileCheck %s --check-prefix=CHECK			; RUN: llc < %s -mtriple=x86_64-unknown-unknown \| FileCheck %s --check-prefix=CHECK

	; FIXME: fold (sra (load i32), 16)) -> (sextload i16)			; fold (sra (load i32), 16)) -> (sextload i16)
	define i32 @sra_half(i32* %p) {			define i32 @sra_half(i32* %p) {
	; CHECK-LABEL: sra_half:			; CHECK-LABEL: sra_half:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movl (%rdi), %eax			; CHECK-NEXT: movswl 2(%rdi), %eax
	; CHECK-NEXT: sarl $16, %eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%load = load i32, i32* %p			%load = load i32, i32* %p
	%shift = ashr i32 %load, 16			%shift = ashr i32 %load, 16
	ret i32 %shift			ret i32 %shift
	}			}

	; Vector version not folded.			; Vector version not folded.
	define <4 x i32> @sra_half_vec(<4 x i32>* %p) {			define <4 x i32> @sra_half_vec(<4 x i32>* %p) {
	; CHECK-LABEL: sra_half_vec:			; CHECK-LABEL: sra_half_vec:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movdqa (%rdi), %xmm0			; CHECK-NEXT: movdqa (%rdi), %xmm0
	; CHECK-NEXT: psrad $16, %xmm0			; CHECK-NEXT: psrad $16, %xmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%load = load <4 x i32>, <4 x i32>* %p			%load = load <4 x i32>, <4 x i32>* %p
	%shift = ashr <4 x i32> %load, <i32 16, i32 16, i32 16, i32 16>			%shift = ashr <4 x i32> %load, <i32 16, i32 16, i32 16, i32 16>
	ret <4 x i32> %shift			ret <4 x i32> %shift
	}			}

	; FIXME: fold (sra (load i64), 48)) -> (sextload i16)			; fold (sra (load i64), 48)) -> (sextload i16)
	define i64 @sra_large_shift(i64* %r) {			define i64 @sra_large_shift(i64* %r) {
	; CHECK-LABEL: sra_large_shift:			; CHECK-LABEL: sra_large_shift:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movq (%rdi), %rax			; CHECK-NEXT: movswq 6(%rdi), %rax
	; CHECK-NEXT: sarq $48, %rax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%t0 = load i64, i64* %r			%t0 = load i64, i64* %r
	%conv = ashr i64 %t0, 48			%conv = ashr i64 %t0, 48
	ret i64 %conv			ret i64 %conv
	}			}

	; Negative test, no fold expected.			; Negative test, no fold expected.
	define i32 @sra_small_shift(i32* %p) {			define i32 @sra_small_shift(i32* %p) {
	Show All 14 Lines
	; CHECK-NEXT: movzbl 1(%rdi), %eax			; CHECK-NEXT: movzbl 1(%rdi), %eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%load = load i16, i16* %p			%load = load i16, i16* %p
	%zext = zext i16 %load to i32			%zext = zext i16 %load to i32
	%shift = ashr i32 %zext, 8			%shift = ashr i32 %zext, 8
	ret i32 %shift			ret i32 %shift
	}			}

	; FIXME: fold (sra (sextload i16 to i32), 8) -> (sextload i8)			; fold (sra (sextload i16 to i32), 8) -> (sextload i8)
	define i32 @sra_of_sextload(i16* %p) {			define i32 @sra_of_sextload(i16* %p) {
	; CHECK-LABEL: sra_of_sextload:			; CHECK-LABEL: sra_of_sextload:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movswl (%rdi), %eax			; CHECK-NEXT: movsbl 1(%rdi), %eax
	; CHECK-NEXT: sarl $8, %eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%load = load i16, i16* %p			%load = load i16, i16* %p
	%sext = sext i16 %load to i32			%sext = sext i16 %load to i32
	%shift = ashr i32 %sext, 8			%shift = ashr i32 %sext, 8
	ret i32 %shift			ret i32 %shift
	}			}

	; Negative test. All bits loaded from memory are shifted out, so we can't fold			; Negative test. All bits loaded from memory are shifted out, so we can't fold
	; away the shift.			; away the shift.
				spatelUnsubmitted Not Done Reply Inline Actions This comment is not accurate. We are replicating (splatting) the sign bit of the loaded i16 across 32-bits, so there's still a shift. In IR, instcombine would transform this into: define i32 @sra_of_sextload_no_fold(i16* %p) { %load = load i16, i16* %p, align 2 %1 = ashr i16 %load, 15 %shift = sext i16 %1 to i32 ret i32 %shift } spatel: This comment is not accurate. We are replicating (splatting) the sign bit of the loaded i16…
				bjopeAuthorUnsubmitted Done Reply Inline Actions Right, the comment is supposed to say "so we can't fold away the shift". bjope: Right, the comment is supposed to say "so we can't fold away the shift".
	define i32 @sra_of_sextload_no_fold(i16* %p) {			define i32 @sra_of_sextload_no_fold(i16* %p) {
	; CHECK-LABEL: sra_of_sextload_no_fold:			; CHECK-LABEL: sra_of_sextload_no_fold:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movswl (%rdi), %eax			; CHECK-NEXT: movswl (%rdi), %eax
	; CHECK-NEXT: sarl $16, %eax			; CHECK-NEXT: sarl $16, %eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%load = load i16, i16* %p			%load = load i16, i16* %p
	%sext = sext i16 %load to i32			%sext = sext i16 %load to i32
	%shift = ashr i32 %sext, 16			%shift = ashr i32 %sext, 16
	ret i32 %shift			ret i32 %shift
	}			}

	; FIXME: Fold even if SRA has multiple uses.			; Fold even if SRA has multiple uses.
	define i32 @sra_to_sextload_multiple_sra_uses(i32* %p) {			define i32 @sra_to_sextload_multiple_sra_uses(i32* %p) {
	; CHECK-LABEL: sra_to_sextload_multiple_sra_uses:			; CHECK-LABEL: sra_to_sextload_multiple_sra_uses:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movl (%rdi), %ecx			; CHECK-NEXT: movswl 2(%rdi), %ecx
	; CHECK-NEXT: sarl $16, %ecx
	; CHECK-NEXT: movl %ecx, %eax			; CHECK-NEXT: movl %ecx, %eax
	; CHECK-NEXT: xorl $6, %eax			; CHECK-NEXT: xorl $6, %eax
	; CHECK-NEXT: orl %ecx, %eax			; CHECK-NEXT: orl %ecx, %eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%load = load i32, i32* %p			%load = load i32, i32* %p
	%shift = ashr i32 %load, 16			%shift = ashr i32 %load, 16
	%use1 = xor i32 %shift, 6			%use1 = xor i32 %shift, 6
	%use2 = or i32 %shift, %use1			%use2 = or i32 %shift, %use1
	ret i32 %use2			ret i32 %use2
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombine] Fold SRA of a load into a narrower sign-extending load
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 398573

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

llvm/test/CodeGen/PowerPC/pr13891.ll

llvm/test/CodeGen/X86/combine-sra-load.ll

This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombine] Fold SRA of a load into a narrower sign-extending load
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 398573

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

llvm/test/CodeGen/PowerPC/pr13891.ll

llvm/test/CodeGen/X86/combine-sra-load.ll

[DAGCombine] Fold SRA of a load into a narrower sign-extending load
ClosedPublic