This is an archive of the discontinued LLVM Phabricator instance.

[TargetLowering][X86][AMDGPU] Teach expandMUL_LOHI to handle a mix of sign and zero extend.
AbandonedPublic

Authored by craig.topper on Sep 5 2022, 12:45 AM.

Download Raw Diff

Details

Reviewers

RKSimon
efriedma
spatel

Summary

If one input of a MUL that needs to be expanded is sign extended and
the other is zero extended, we can use an unsigned mul and apply
a correction. If the signed number is negative, we subtract the lower
bits of the zero extended input from the high half of the result.

Fixes PR57549, but not in the same way.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

craig.topper created this revision.Sep 5 2022, 12:45 AM

Herald added a project: Restricted Project. · View Herald TranscriptSep 5 2022, 12:45 AM

Herald added subscribers: kosarev, StephenFan, kerbowa and 8 others. · View Herald Transcript

craig.topper requested review of this revision.Sep 5 2022, 12:45 AM

Herald added a project: Restricted Project. · View Herald TranscriptSep 5 2022, 12:45 AM

Herald added a subscriber: wdng. · View Herald Transcript

Harbormaster completed remote builds in B185042: Diff 457895.Sep 5 2022, 12:46 AM

Should we have a generic combine instead?

If we have something like this - https://alive2.llvm.org/ce/z/NAmVa3 - then it's always better to get rid of the mul unless we have 'minsize':

define i32 @src(i32 %x, i32 %y) {
  %ysign = ashr i32 %y, 31
  %m = mul i32 %x, %ysign
  ret i32 %m
}

define i32 @tgt(i32 %x, i32 %y) {
  %isneg = icmp slt i32 %y, 0
  %negx = sub i32 0, %x
  %m = select i1 %isneg, i32 %negx, i32 0
  ret i32 %m
}

% llc -o - mul.ll -mtriple=riscv32 -mattr=m
	srai	a1, a1, 31
	mul	a0, a0, a1
...
	neg	a0, a0
	srai	a1, a1, 31
	and	a0, a1, a0

% llc -o - mul.ll -mtriple=aarch64         
	asr	w8, w1, #31
	mul	w0, w0, w8
...
	neg	w8, w0
	and	w0, w8, w1, asr #31

% llc -o - mul.ll -mtriple=x86_64 
	movl	%esi, %eax
	sarl	$31, %eax
	imull	%edi, %eax
...
	movl	%esi, %eax
	negl	%edi
	sarl	$31, %eax
	andl	%edi, %eax

In D133282#3770947, @spatel wrote:

Should we have a generic combine instead?

If we have something like this - https://alive2.llvm.org/ce/z/NAmVa3 - then it's always better to get rid of the mul unless we have 'minsize':

define i32 @src(i32 %x, i32 %y) {
  %ysign = ashr i32 %y, 31
  %m = mul i32 %x, %ysign
  ret i32 %m
}

define i32 @tgt(i32 %x, i32 %y) {
  %isneg = icmp slt i32 %y, 0
  %negx = sub i32 0, %x
  %m = select i1 %isneg, i32 %negx, i32 0
  ret i32 %m
}

% llc -o - mul.ll -mtriple=riscv32 -mattr=m
	srai	a1, a1, 31
	mul	a0, a0, a1
...
	neg	a0, a0
	srai	a1, a1, 31
	and	a0, a1, a0

% llc -o - mul.ll -mtriple=aarch64         
	asr	w8, w1, #31
	mul	w0, w0, w8
...
	neg	w8, w0
	and	w0, w8, w1, asr #31

% llc -o - mul.ll -mtriple=x86_64 
	movl	%esi, %eax
	sarl	$31, %eax
	imull	%edi, %eax
...
	movl	%esi, %eax
	negl	%edi
	sarl	$31, %eax
	andl	%edi, %eax

Would we do a second combine for (add Z, (and X, (neg Y))) to get the (sub Z, (and X, Y))?

In D133282#3770953, @craig.topper wrote:

Would we do a second combine for (add Z, (and X, (neg Y))) to get the (sub Z, (and X, Y))?

I didn't look past the first missing transform...but sure, if that's missing too. :)

Something like this?
https://alive2.llvm.org/ce/z/zFxp6A

Seems to be missed in IR too.

spatel mentioned this in D133369: [InstCombine] fold add+negate through select into sub.Sep 6 2022, 11:28 AM

spatel mentioned this in rG7c57180900fb: [InstCombine] fold add+negate through select into sub.Sep 7 2022, 5:23 AM

Replaced by D133399

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

SelectionDAG/

TargetLowering.cpp

46 lines

test/

CodeGen/

AMDGPU/

mad_64_32.ll

8 lines

X86/

extmul128.ll

11 lines

Diff 457895

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

//===-- TargetLowering.cpp - Implement the TargetLowering class -----------===//		//===-- TargetLowering.cpp - Implement the TargetLowering class -----------===//
		Lint: Lint Inline Actions clang-format not found in user’s local PATH; not linting file. Lint: Lint: clang-format not found in user’s local PATH; not linting file.
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
▲ Show 20 Lines • Show All 7,001 Lines • ▼ Show 20 Lines	if (!LL.getNode() && !RL.getNode() &&
LL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LHS);		LL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LHS);
RL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RHS);		RL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RHS);
}		}

if (!LL.getNode())		if (!LL.getNode())
return false;		return false;

APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);		APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);
if (DAG.MaskedValueIsZero(LHS, HighMask) &&		bool LHSIsZ = DAG.MaskedValueIsZero(LHS, HighMask);
DAG.MaskedValueIsZero(RHS, HighMask)) {		bool RHSIsZ = DAG.MaskedValueIsZero(RHS, HighMask);

		if (LHSIsZ && RHSIsZ) {
// The inputs are both zero-extended.		// The inputs are both zero-extended.
if (MakeMUL_LOHI(LL, RL, Lo, Hi, false)) {		if (MakeMUL_LOHI(LL, RL, Lo, Hi, false)) {
Result.push_back(Lo);		Result.push_back(Lo);
Result.push_back(Hi);		Result.push_back(Hi);
if (Opcode != ISD::MUL) {		if (Opcode != ISD::MUL) {
SDValue Zero = DAG.getConstant(0, dl, HiLoVT);		SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
Result.push_back(Zero);		Result.push_back(Zero);
Result.push_back(Zero);		Result.push_back(Zero);
}		}
return true;		return true;
}		}
}		}

if (!VT.isVector() && Opcode == ISD::MUL &&		if (!VT.isVector() && Opcode == ISD::MUL) {
DAG.ComputeMaxSignificantBits(LHS) <= InnerBitSize &&		bool LHSIsS = DAG.ComputeMaxSignificantBits(LHS) <= InnerBitSize;
DAG.ComputeMaxSignificantBits(RHS) <= InnerBitSize) {		bool RHSIsS = DAG.ComputeMaxSignificantBits(RHS) <= InnerBitSize;
		if (LHSIsS && RHSIsS) {
// The input values are both sign-extended.		// The input values are both sign-extended.
// TODO non-MUL case?		// TODO non-MUL case?
if (MakeMUL_LOHI(LL, RL, Lo, Hi, true)) {		if (MakeMUL_LOHI(LL, RL, Lo, Hi, true)) {
Result.push_back(Lo);		Result.push_back(Lo);
Result.push_back(Hi);		Result.push_back(Hi);
return true;		return true;
}		}
}		}
		if ((LHSIsZ && RHSIsS) || (LHSIsS && RHSIsZ)) {
		// One input is sign-extended and the other is zero-extended.
		// Use a mulhu with a correction.
		if (MakeMUL_LOHI(LL, RL, Lo, Hi, false)) {
		// Canonicalize sign-extended value to LL and zero-extended value to RL.
		if (LHSIsZ)
		std::swap(LL, RL);
		// High bits of LHS are 0 or -1. We need to multiply this by RL which
		// will give 0 or -RL. This needs to be added to Hi. This can be done
		// as:
		// Hi -= LL < 0 ? RL : 0;
		LL = DAG.getNode(ISD::SRA, dl, HiLoVT, LL,
		DAG.getShiftAmountConstant(InnerBitSize - 1, HiLoVT, dl));
		LL = DAG.getNode(ISD::AND, dl, HiLoVT, LL, RL);
		Hi = DAG.getNode(ISD::SUB, dl, HiLoVT, Hi, LL);
		Result.push_back(Lo);
		Result.push_back(Hi);
		return true;
		}
		}
		}

unsigned ShiftAmount = OuterBitSize - InnerBitSize;		unsigned ShiftAmount = OuterBitSize - InnerBitSize;
SDValue Shift = DAG.getShiftAmountConstant(ShiftAmount, VT, dl);		SDValue Shift = DAG.getShiftAmountConstant(ShiftAmount, VT, dl);

if (!LH.getNode() && !RH.getNode() &&		if (!LH.getNode() && !RH.getNode() &&
isOperationLegalOrCustom(ISD::SRL, VT) &&		isOperationLegalOrCustom(ISD::SRL, VT) &&
isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {		isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
LH = DAG.getNode(ISD::SRL, dl, VT, LHS, Shift);		LH = DAG.getNode(ISD::SRL, dl, VT, LHS, Shift);
▲ Show 20 Lines • Show All 2,882 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/mad_64_32.ll

	Show First 20 Lines • Show All 369 Lines • ▼ Show 20 Lines
	; CI-NEXT: v_mul_lo_u32 v4, v4, v1			; CI-NEXT: v_mul_lo_u32 v4, v4, v1
	; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]			; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
	; CI-NEXT: v_add_i32_e32 v1, vcc, v4, v1			; CI-NEXT: v_add_i32_e32 v1, vcc, v4, v1
	; CI-NEXT: s_setpc_b64 s[30:31]			; CI-NEXT: s_setpc_b64 s[30:31]
	;			;
	; SI-LABEL: mad_i64_i32_extops_i32_i64:			; SI-LABEL: mad_i64_i32_extops_i32_i64:
	; SI: ; %bb.0:			; SI: ; %bb.0:
	; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v0			; SI-NEXT: v_mul_hi_u32 v4, v0, v1
	; SI-NEXT: v_mul_hi_u32 v5, v0, v1			; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v0
	; SI-NEXT: v_mul_lo_u32 v4, v4, v1
	; SI-NEXT: v_mul_lo_u32 v0, v0, v1			; SI-NEXT: v_mul_lo_u32 v0, v0, v1
	; SI-NEXT: v_add_i32_e32 v1, vcc, v5, v4			; SI-NEXT: v_and_b32_e32 v5, v5, v1
				; SI-NEXT: v_sub_i32_e32 v1, vcc, v4, v5
	; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2			; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
	; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc			; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
	; SI-NEXT: s_setpc_b64 s[30:31]			; SI-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX9-LABEL: mad_i64_i32_extops_i32_i64:			; GFX9-LABEL: mad_i64_i32_extops_i32_i64:
	; GFX9: ; %bb.0:			; GFX9: ; %bb.0:
	; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX9-NEXT: v_mov_b32_e32 v4, v1			; GFX9-NEXT: v_mov_b32_e32 v4, v1
	▲ Show 20 Lines • Show All 550 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/extmul128.ll

Show All 23 Lines	; CHECK-NEXT: retq
ret i128 %cc		ret i128 %cc
}		}
define i128 @i64_zext_sext_i128(i64 %a, i64 %b) {		define i128 @i64_zext_sext_i128(i64 %a, i64 %b) {
; CHECK-LABEL: i64_zext_sext_i128:		; CHECK-LABEL: i64_zext_sext_i128:
; CHECK: # %bb.0:		; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax		; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %rsi		; CHECK-NEXT: mulq %rsi
; CHECK-NEXT: sarq $63, %rsi		; CHECK-NEXT: sarq $63, %rsi
; CHECK-NEXT: imulq %rdi, %rsi		; CHECK-NEXT: andq %rdi, %rsi
; CHECK-NEXT: addq %rsi, %rdx		; CHECK-NEXT: subq %rsi, %rdx
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%aa = zext i64 %a to i128		%aa = zext i64 %a to i128
%bb = sext i64 %b to i128		%bb = sext i64 %b to i128
%cc = mul i128 %aa, %bb		%cc = mul i128 %aa, %bb
ret i128 %cc		ret i128 %cc
}		}

define i128 @i64_sext_zext_i128(i64 %a, i64 %b) {		define i128 @i64_sext_zext_i128(i64 %a, i64 %b) {
; CHECK-LABEL: i64_sext_zext_i128:		; CHECK-LABEL: i64_sext_zext_i128:
; CHECK: # %bb.0:		; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax		; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq %rdi, %rcx
; CHECK-NEXT: sarq $63, %rcx
; CHECK-NEXT: mulq %rsi		; CHECK-NEXT: mulq %rsi
; CHECK-NEXT: imulq %rsi, %rcx		; CHECK-NEXT: sarq $63, %rdi
; CHECK-NEXT: addq %rcx, %rdx		; CHECK-NEXT: andq %rsi, %rdi
		; CHECK-NEXT: subq %rdi, %rdx
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%aa = sext i64 %a to i128		%aa = sext i64 %a to i128
%bb = zext i64 %b to i128		%bb = zext i64 %b to i128
%cc = mul i128 %aa, %bb		%cc = mul i128 %aa, %bb
ret i128 %cc		ret i128 %cc
}		}