This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Transforms/Utils/
-
Transforms/
-
Utils/
-
Local.cpp
-
test/Transforms/
-
Transforms/
-
CodeGenPrepare/X86/
-
X86/
-
bitreverse-recognize.ll
-
InstCombine/
1/1
bitreverse.ll
-
bswap.ll
-
fsh.ll

Differential D158548

Limit bswap and bitreverse matching to legal integer width
Needs ReviewPublic

Authored by vmustya on Aug 22 2023, 1:57 PM.

Download Raw Diff

This revision needs review, but there are no reviewers specified.

Details

Reviewers: None

Summary

The InstCombine pass matched a sequence of and-shl-add instructions into a
bitreverse even when the demanded width is illegal for the target machine.
That produces IR sequences like the following one:

%trunc = trunc i32 %x to i9
%rev = call i9 @llvm.bitreverse.i9(i9 %trunc)
%mask = and i9 %rev, -128
%reverse = zext i9 %mask to i32

Bitreverse intrinsics of illegal width have to be emulated, so this combine
produces inefficient code.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

vmustya created this revision.Aug 22 2023, 1:57 PM

Herald added a project: Restricted Project. · View Herald TranscriptAug 22 2023, 1:57 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

vmustya requested review of this revision.Aug 22 2023, 1:57 PM

Herald added a project: Restricted Project. · View Herald TranscriptAug 22 2023, 1:57 PM

Herald added a subscriber: llvm-commits. · View Herald Transcript

craig.topper added a subscriber: craig.topper.Aug 22 2023, 1:59 PM

craig.topper added inline comments.

llvm/test/Transforms/InstCombine/bitreverse.ll
265	The CHECK lines in this file are generated by a script, update_llc_test_checks.py. We shouldn't use CHECK-NOT in such tests, as the script will just overwrite it the next time someone runs it.

Harbormaster completed remote builds in B254188: Diff 552495.Aug 22 2023, 3:27 PM

I've updated LIT tests using the utils/update_test_checks.py tool.

Harbormaster completed remote builds in B254378: Diff 552756.Aug 23 2023, 10:48 AM

Can the expansion produce better code than it does now, or does some important information get lost that prevents this?

Is the issue related to weird types like i9, or would this also be a problem for (say) i8?

In D158548#4611202, @nikic wrote:

Can the expansion produce better code than it does now, or does some important information get lost that prevents this?

Is the issue related to weird types like i9, or would this also be a problem for (say) i8?

I've only seen the issue with the weird types like i9. I've added the minimal reproducer as a lit test case:

define i32 @illegal_width(i32 %x) {
  %b0 = and i32 %x, 1
  %b1 = and i32 %x, 2
  %shift1 = mul nuw nsw i32 %b1, 64
  %shift0 = shl nuw nsw i32 %b0, 8
  %reverse = add i32 %shift0, %shift1
  ret i32 %reverse
}

For x86_64 LLVM generates the following assembly:

	rol	di, 8
	mov	eax, edi
	shl	eax, 6
	and	eax, 16384
	shl	edi, 5
	and	edi, 16384
	lea	eax, [rdi + 2*rax]
	shr	eax, 7
	ret

With the patch applied, the assembly looks much better:

	mov	eax, edi
	and	eax, 2
	shl	eax, 6
	and	edi, 1
	shl	edi, 8
	or	eax, edi
	ret

In D158548#4611411, @vmustya wrote:
In D158548#4611202, @nikic wrote:

Can the expansion produce better code than it does now, or does some important information get lost that prevents this?

Is the issue related to weird types like i9, or would this also be a problem for (say) i8?

I've only seen the issue with the weird types like i9. I've added the minimal reproducer as a lit test case:
define i32 @illegal_width(i32 %x) {
  %b0 = and i32 %x, 1
  %b1 = and i32 %x, 2
  %shift1 = mul nuw nsw i32 %b1, 64
  %shift0 = shl nuw nsw i32 %b0, 8
  %reverse = add i32 %shift0, %shift1
  ret i32 %reverse
}
For x86_64 LLVM generates the following assembly:
	rol	di, 8
	mov	eax, edi
	shl	eax, 6
	and	eax, 16384
	shl	edi, 5
	and	edi, 16384
	lea	eax, [rdi + 2*rax]
	shr	eax, 7
	ret
With the patch applied, the assembly looks much better:
	mov	eax, edi
	and	eax, 2
	shl	eax, 6
	and	edi, 1
	shl	edi, 8
	or	eax, edi
	ret

The backend is expanding this to an i16 reverse sequence followed by a shift. Only 2 bits of that are demanded, but I suspect multiple uses prevent SimplifyDemandedBits from optimizing it.

There are only 2 bits involved in the reverse here. Perhaps we shouldn't form a bitreverse from something so sparse?

vmustya marked an inline comment as done.Sep 1 2023, 1:29 PM

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Utils/

Local.cpp

5 lines

test/

Transforms/

CodeGenPrepare/

X86/

bitreverse-recognize.ll

2 lines

InstCombine/

bitreverse.ll

19 lines

bswap.ll

2 lines

fsh.ll

9 lines

Diff 552756

llvm/lib/Transforms/Utils/Local.cpp

Show First 20 Lines • Show All 3,385 Lines • ▼ Show 20 Lines	if (auto *IVecTy = dyn_cast<VectorType>(ITy))
DemandedTy = VectorType::get(DemandedTy, IVecTy);		DemandedTy = VectorType::get(DemandedTy, IVecTy);
}		}

// Check BitProvenance hasn't found a source larger than the result type.		// Check BitProvenance hasn't found a source larger than the result type.
unsigned DemandedBW = DemandedTy->getScalarSizeInBits();		unsigned DemandedBW = DemandedTy->getScalarSizeInBits();
if (DemandedBW > ITy->getScalarSizeInBits())		if (DemandedBW > ITy->getScalarSizeInBits())
return false;		return false;

		// Check that the bswap/bitreverse will have legal bit width.
		auto &DL = I->getModule()->getDataLayout();
		if (DL.isIllegalInteger(DemandedBW))
		return false;

// Now, is the bit permutation correct for a bswap or a bitreverse? We can		// Now, is the bit permutation correct for a bswap or a bitreverse? We can
// only byteswap values with an even number of bytes.		// only byteswap values with an even number of bytes.
APInt DemandedMask = APInt::getAllOnes(DemandedBW);		APInt DemandedMask = APInt::getAllOnes(DemandedBW);
bool OKForBSwap = MatchBSwaps && (DemandedBW % 16) == 0;		bool OKForBSwap = MatchBSwaps && (DemandedBW % 16) == 0;
bool OKForBitReverse = MatchBitReversals;		bool OKForBitReverse = MatchBitReversals;
for (unsigned BitIdx = 0;		for (unsigned BitIdx = 0;
(BitIdx < DemandedBW) && (OKForBSwap || OKForBitReverse); ++BitIdx) {		(BitIdx < DemandedBW) && (OKForBSwap || OKForBitReverse); ++BitIdx) {
if (BitProvenance[BitIdx] == BitPart::Unset) {		if (BitProvenance[BitIdx] == BitPart::Unset) {
▲ Show 20 Lines • Show All 197 Lines • Show Last 20 Lines

llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-recognize.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -loop-unroll -codegenprepare -S -mtriple=x86_64-- -mattr=+xop | FileCheck %s			; RUN: opt < %s -loop-unroll -codegenprepare -S -mtriple=x86_64-- -mattr=+xop | FileCheck %s

				target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32:64"

	define i32 @bitreverse_i32(i32 %a) {			define i32 @bitreverse_i32(i32 %a) {
	; CHECK-LABEL: @bitreverse_i32(			; CHECK-LABEL: @bitreverse_i32(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[REV:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[A:%.*]])			; CHECK-NEXT: [[REV:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[A:%.*]])
	; CHECK-NEXT: ret i32 [[REV]]			; CHECK-NEXT: ret i32 [[REV]]
	;			;
	entry:			entry:
	br label %for.body			br label %for.body
	▲ Show 20 Lines • Show All 61 Lines • Show Last 20 Lines

llvm/test/Transforms/InstCombine/bitreverse.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -S | FileCheck %s		; RUN: opt < %s -passes=instcombine -S | FileCheck %s

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"		target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32:64"

declare i16 @llvm.bitreverse.i16(i16)		declare i16 @llvm.bitreverse.i16(i16)
declare i32 @llvm.bitreverse.i32(i32)		declare i32 @llvm.bitreverse.i32(i32)
declare i64 @llvm.bitreverse.i64(i64)		declare i64 @llvm.bitreverse.i64(i64)
declare <2 x i8> @llvm.bitreverse.v2i8(<2 x i8>)		declare <2 x i8> @llvm.bitreverse.v2i8(<2 x i8>)
declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>)		declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>)
declare void @use_i32(i32)		declare void @use_i32(i32)
declare void @use_i64(i64)		declare void @use_i64(i64)
▲ Show 20 Lines • Show All 243 Lines • ▼ Show 20 Lines	;
%6 = and i64 %5, 558144		%6 = and i64 %5, 558144
%7 = or i64 %4, %6		%7 = or i64 %4, %6
%8 = mul nuw nsw i64 %7, 65793		%8 = mul nuw nsw i64 %7, 65793
%9 = lshr i64 %8, 16		%9 = lshr i64 %8, 16
%10 = trunc i64 %9 to i8		%10 = trunc i64 %9 to i8
ret i8 %10		ret i8 %10
}		}

		define i32 @illegal_width(i32 %x) {
		; CHECK-LABEL: @illegal_width(
		craig.topperUnsubmitted Done Reply Inline Actions The CHECK lines in this file are generated by a script, update_llc_test_checks.py. We shouldn't use CHECK-NOT in such tests as the script will just overwrite it the next time someone runs. craig.topper: The CHECK lines in this file are generated by a script, update_llc_test_checks.py. We shouldn't…
		; CHECK-NEXT: [[B1:%.*]] = shl i32 [[X:%.*]], 6
		; CHECK-NEXT: [[SHIFT1:%.*]] = and i32 [[B1]], 128
		; CHECK-NEXT: [[B0:%.*]] = shl i32 [[X]], 8
		; CHECK-NEXT: [[SHIFT0:%.*]] = and i32 [[B0]], 256
		; CHECK-NEXT: [[REVERSE:%.*]] = or i32 [[SHIFT0]], [[SHIFT1]]
		; CHECK-NEXT: ret i32 [[REVERSE]]
		;
		%b0 = and i32 %x, 1
		%b1 = and i32 %x, 2
		%shift1 = mul nuw nsw i32 %b1, 64
		%shift0 = shl nuw nsw i32 %b0, 8
		%reverse = add i32 %shift0, %shift1
		ret i32 %reverse
		}

define i4 @shuf_4bits(<4 x i1> %x) {		define i4 @shuf_4bits(<4 x i1> %x) {
; CHECK-LABEL: @shuf_4bits(		; CHECK-LABEL: @shuf_4bits(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4		; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4
; CHECK-NEXT: [[CAST:%.*]] = call i4 @llvm.bitreverse.i4(i4 [[TMP1]])		; CHECK-NEXT: [[CAST:%.*]] = call i4 @llvm.bitreverse.i4(i4 [[TMP1]])
; CHECK-NEXT: ret i4 [[CAST]]		; CHECK-NEXT: ret i4 [[CAST]]
;		;
%bitreverse = shufflevector <4 x i1> %x, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>		%bitreverse = shufflevector <4 x i1> %x, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%cast = bitcast <4 x i1> %bitreverse to i4		%cast = bitcast <4 x i1> %bitreverse to i4
▲ Show 20 Lines • Show All 239 Lines • Show Last 20 Lines

llvm/test/Transforms/InstCombine/bswap.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -passes=instcombine -S | FileCheck %s			; RUN: opt < %s -passes=instcombine -S | FileCheck %s

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32:64"

	define i32 @test1(i32 %i) {			define i32 @test1(i32 %i) {
	; CHECK-LABEL: @test1(			; CHECK-LABEL: @test1(
	; CHECK-NEXT: [[T12:%.*]] = call i32 @llvm.bswap.i32(i32 [[I:%.*]])			; CHECK-NEXT: [[T12:%.*]] = call i32 @llvm.bswap.i32(i32 [[I:%.*]])
	; CHECK-NEXT: ret i32 [[T12]]			; CHECK-NEXT: ret i32 [[T12]]
	;			;
	%t1 = lshr i32 %i, 24			%t1 = lshr i32 %i, 24
	%t3 = lshr i32 %i, 8			%t3 = lshr i32 %i, 8
	▲ Show 20 Lines • Show All 959 Lines • Show Last 20 Lines

llvm/test/Transforms/InstCombine/fsh.ll

	Show First 20 Lines • Show All 664 Lines • ▼ Show 20 Lines
	;			;
	%t1 = and i32 %a, 4294901760 ; 0xffff0000			%t1 = and i32 %a, 4294901760 ; 0xffff0000
	%t2 = call i32 @llvm.fshl.i32(i32 %t1, i32 %t1, i32 16)			%t2 = call i32 @llvm.fshl.i32(i32 %t1, i32 %t1, i32 16)
	ret i32 %t2			ret i32 %t2
	}			}

	define i32 @fshl_mask_args_same2(i32 %a) {			define i32 @fshl_mask_args_same2(i32 %a) {
	; CHECK-LABEL: @fshl_mask_args_same2(			; CHECK-LABEL: @fshl_mask_args_same2(
	; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[A:%.*]] to i16			; CHECK-NEXT: [[T1:%.*]] = shl i32 [[A:%.*]], 8
	; CHECK-NEXT: [[REV:%.*]] = shl i16 [[TRUNC]], 8			; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], 65280
	; CHECK-NEXT: [[T2:%.*]] = zext i16 [[REV]] to i32
	; CHECK-NEXT: ret i32 [[T2]]			; CHECK-NEXT: ret i32 [[T2]]
	;			;
	%t1 = and i32 %a, 255			%t1 = and i32 %a, 255
	%t2 = call i32 @llvm.fshl.i32(i32 %t1, i32 %t1, i32 8)			%t2 = call i32 @llvm.fshl.i32(i32 %t1, i32 %t1, i32 8)
	ret i32 %t2			ret i32 %t2
	}			}

	define i32 @fshl_mask_args_same3(i32 %a) {			define i32 @fshl_mask_args_same3(i32 %a) {
	; CHECK-LABEL: @fshl_mask_args_same3(			; CHECK-LABEL: @fshl_mask_args_same3(
	; CHECK-NEXT: [[REV:%.*]] = shl i32 [[A:%.*]], 24			; CHECK-NEXT: [[T2:%.*]] = shl i32 [[A:%.*]], 24
	; CHECK-NEXT: ret i32 [[REV]]			; CHECK-NEXT: ret i32 [[T2]]
	;			;
	%t1 = and i32 %a, 255			%t1 = and i32 %a, 255
	%t2 = call i32 @llvm.fshl.i32(i32 %t1, i32 %t1, i32 24)			%t2 = call i32 @llvm.fshl.i32(i32 %t1, i32 %t1, i32 24)
	ret i32 %t2			ret i32 %t2
	}			}

	define i32 @fshl_mask_args_different(i32 %a) {			define i32 @fshl_mask_args_different(i32 %a) {
	; CHECK-LABEL: @fshl_mask_args_different(			; CHECK-LABEL: @fshl_mask_args_different(
	▲ Show 20 Lines • Show All 181 Lines • Show Last 20 Lines