This is an archive of the discontinued LLVM Phabricator instance.

[X86][BITREVERSE] Optimized bitreverse builtin for 8 bit scalar integer (PR31810)
AbandonedPublic

Authored by kbelochapka on Jul 18 2017, 7:13 PM.

Download Raw Diff

Details

Reviewers

craig.topper
RKSimon
spatel

Summary

Manually optimized code for the 8-bit integer bitreverse builtin: the instruction count is reduced from 14 to 11, with a slight performance gain.

Diff Detail

Event Timeline

kbelochapka created this revision.Jul 18 2017, 7:13 PM

kbelochapka retitled this revision from [X86][BITREVERSE] Optimized bitreverse builtin for 8 bit scalar integer to [X86][BITREVERSE] Optimized bitreverse builtin for 8 bit scalar integer (PR31810).Jul 18 2017, 7:17 PM

kbelochapka edited the summary of this revision. (Show Details)

kbelochapka added a reviewer: craig.topper.

craig.topper added reviewers: RKSimon, spatel.Jul 18 2017, 7:19 PM

The performance implications on PR31810 need to be settled before this can be taken any further.

lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
2613	This is general legalization code - we should ensure that ISD::ROTL is legal or custom before attempting this path, otherwise the additional shift/mask code will definitely be a regression. With that in place you can probably drop the VT.isScalarInteger() requirement.
test/CodeGen/X86/bitreverse.ll
338	Please use the update_llc_test_checks.py script

Performance measurements did not show any benefit for the proposed 8-bit bitreverse intrinsic implementation over the existing one.

Revision Contents

Path

Size

lib/

CodeGen/

SelectionDAG/

LegalizeDAG.cpp

37 lines

test/

CodeGen/

X86/

bitreverse.ll

109 lines

Diff 107232

lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

Show First 20 Lines • Show All 2,562 Lines • ▼ Show 20 Lines	SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT,
// Okay, we found the operation and type to use.		// Okay, we found the operation and type to use.
SDValue Operation = DAG.getNode(OpToUse, dl, NewOutTy, LegalOp);		SDValue Operation = DAG.getNode(OpToUse, dl, NewOutTy, LegalOp);

// Truncate the result of the extended FP_TO_*INT operation to the desired		// Truncate the result of the extended FP_TO_*INT operation to the desired
// size.		// size.
return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);		return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);
}		}

		/// Optimized version of 8 bit BITREVERSE built from three rotates.
		///
		/// With the input bits numbered b7..b0 the expansion is:
		///   1. rotate the even bits (mask 0x55) left by 2 and keep the odd bits;
		///   2. rotate bits 1, 2, 5 and 6 (mask 0x66) left by 4 and keep the rest;
		///   3. rotate the whole byte left by 1.
		/// After step 3 bit i of the result holds bit (7 - i) of the input.
		///
		/// \param DAG  DAG in which the replacement nodes are created.
		/// \param Op   Value to reverse; its scalar size must be 8 bits.
		/// \param dl   Debug location attached to every created node.
		/// \returns    The node computing the bit-reversed value, of Op's type.
		static SDValue ExpandBITREVERSE8(SelectionDAG &DAG, SDValue Op,
		const SDLoc &dl) {
		EVT VT = Op.getValueType();

		assert(VT.getScalarSizeInBits() == 8 && "Expected 8 bit size");

		// Rotate amounts must use the target's shift-amount type rather than the
		// value type, matching how ExpandBITREVERSE builds its shift amounts.
		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
		EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());

		SDValue Mask55 = DAG.getConstant(0x55, dl, VT);
		SDValue Mask66 = DAG.getConstant(0x66, dl, VT);
		SDValue Rot1 = DAG.getConstant(1, dl, SHVT);
		SDValue Rot2 = DAG.getConstant(2, dl, SHVT);
		SDValue Rot4 = DAG.getConstant(4, dl, SHVT);

		// Step 1: split into even/odd bits; x ^ (x & 0x55) leaves only the odd bits.
		SDValue Even = DAG.getNode(ISD::AND, dl, VT, Op, Mask55); // AND BL,55H
		SDValue Odd = DAG.getNode(ISD::XOR, dl, VT, Op, Even); // XOR AL,BL
		Even = DAG.getNode(ISD::ROTL, dl, VT, Even, Rot2); // ROL BL,2
		SDValue Stage1 = DAG.getNode(ISD::OR, dl, VT, Even, Odd); // OR AL,BL

		// Step 2: swap bits 1,2 with bits 5,6; the XOR keeps bits 0,3,4,7 in place.
		SDValue Inner = DAG.getNode(ISD::AND, dl, VT, Stage1, Mask66); // AND AL,66H
		SDValue Outer = DAG.getNode(ISD::XOR, dl, VT, Inner, Stage1); // XOR CL,AL
		Inner = DAG.getNode(ISD::ROTL, dl, VT, Inner, Rot4); // ROL AL,4
		SDValue Stage2 = DAG.getNode(ISD::OR, dl, VT, Inner, Outer); // OR AL,CL

		// Step 3: the final rotate moves every bit to its reversed position.
		return DAG.getNode(ISD::ROTL, dl, VT, Stage2, Rot1); // ROL AL,1
		}

/// Legalize a BITREVERSE scalar/vector operation as a series of mask + shifts.		/// Legalize a BITREVERSE scalar/vector operation as a series of mask + shifts.
SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {		SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());		EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
unsigned Sz = VT.getScalarSizeInBits();		unsigned Sz = VT.getScalarSizeInBits();

SDValue Tmp, Tmp2, Tmp3;		SDValue Tmp, Tmp2, Tmp3;

		if (Sz == 8 && VT.isScalarInteger()) {
		RKSimonUnsubmitted Not Done Reply Inline Actions This is general legalization code - we should ensure that ISD::ROTL is legalorcustom before attempting this path, otherwise the additional shift/mask code will definitely be a regression. With that in place you can probably drop the VT.isScalarInteger() requirement. RKSimon: This is general legalization code - we should ensure that ISD::ROTL is legalorcustom before…
		return ExpandBITREVERSE8(DAG, Op, dl);
		}
// If we can, perform BSWAP first and then the mask+swap the i4, then i2		// If we can, perform BSWAP first and then the mask+swap the i4, then i2
// and finally the i1 pairs.		// and finally the i1 pairs.
// TODO: We can easily support i4/i2 legal types if any target ever does.		// TODO: We can easily support i4/i2 legal types if any target ever does.
if (Sz >= 8 && isPowerOf2_32(Sz)) {		if (Sz >= 8 && isPowerOf2_32(Sz)) {
// Create the masks - repeating the pattern every byte.		// Create the masks - repeating the pattern every byte.
APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0);		APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0);
APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0);		APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0);
for (unsigned J = 0; J != Sz; J += 8) {		for (unsigned J = 0; J != Sz; J += 8) {
▲ Show 20 Lines • Show All 2,061 Lines • Show Last 20 Lines

test/CodeGen/X86/bitreverse.ll

Show First 20 Lines • Show All 317 Lines • ▼ Show 20 Lines	; X64-NEXT: retq
ret i16 %b		ret i16 %b
}		}

declare i8 @llvm.bitreverse.i8(i8) readnone		declare i8 @llvm.bitreverse.i8(i8) readnone

define i8 @test_bitreverse_i8(i8 %a) {		define i8 @test_bitreverse_i8(i8 %a) {
; X86-LABEL: test_bitreverse_i8:		; X86-LABEL: test_bitreverse_i8:
; X86: # BB#0:		; X86: # BB#0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al		; X86-NEXT: movb {{[0-9]+}}(%esp), [[AL:%[a-s][i]*[l]]]
; X86-NEXT: rolb $4, %al		; X86-NEXT: movl [[EAX:%[e\|r][a-z][x\|i]]], [[ECX:%[e\|r][a-z][x\|i]]]
; X86-NEXT: movl %eax, %ecx		; X86-NEXT: andb $85, [[CL:%[a-s][i]*[l]]]
; X86-NEXT: andb $51, %cl		; X86-NEXT: xorb [[CL]], [[DL:%[a-s][i]*[l]]]
; X86-NEXT: shlb $2, %cl		; X86-NEXT: rolb $2, [[CL]]
; X86-NEXT: andb $-52, %al		; X86-NEXT: orb [[DL]], [[CL]]
; X86-NEXT: shrb $2, %al		; X86-NEXT: movl [[ECX]], [[EAX]]
; X86-NEXT: orb %cl, %al		; X86-NEXT: andb $102, [[AL:%[a-s][i]*[l]]]
; X86-NEXT: movl %eax, %ecx		; X86-NEXT: xorb [[AL]], [[CL]]
; X86-NEXT: andb $85, %cl		; X86-NEXT: rolb $4, [[AL]]
; X86-NEXT: addb %cl, %cl		; X86-NEXT: orb [[CL]], [[AL]]
; X86-NEXT: andb $-86, %al		; X86-NEXT: rolb [[AL]]
; X86-NEXT: shrb %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: retl		; X86-NEXT: retl
		RKSimonUnsubmitted Not Done Reply Inline Actions Please use the update_llc_test_checks.py script RKSimon: Please use the update_llc_test_checks.py script
;
; X64-LABEL: test_bitreverse_i8:		; X64-LABEL: test_bitreverse_i8:
; X64: # BB#0:		; X64: # BB#0:
; X64-NEXT: rolb $4, %dil		; X64-NEXT: movl [[EDI:%[e\|r][a-z][x\|i]]], [[ECX:%[e\|r][a-z][x\|i]]]
; X64-NEXT: movl %edi, %eax		; X64-NEXT: andb $85, [[CL:%[a-s][i]*[l]]]
; X64-NEXT: andb $51, %al		; X64-NEXT: xorb [[CL]], [[DL:%[a-s][i]*[l]]]
; X64-NEXT: shlb $2, %al		; X64-NEXT: rolb $2, [[CL]]
; X64-NEXT: andb $-52, %dil		; X64-NEXT: orb [[DL]], [[CL]]
; X64-NEXT: shrb $2, %dil		; X64-NEXT: movl [[ECX]], [[EAX:%[e\|r][a-z][x\|i]]]
; X64-NEXT: orb %al, %dil		; X64-NEXT: andb $102, [[AL:%[a-s][i]*[l]]]
; X64-NEXT: movl %edi, %eax		; X64-NEXT: xorb [[AL]], [[CL]]
; X64-NEXT: andb $85, %al		; X64-NEXT: rolb $4, [[AL]]
; X64-NEXT: addb %al, %al		; X64-NEXT: orb [[CL]], [[AL]]
; X64-NEXT: andb $-86, %dil		; X64-NEXT: rolb [[AL]]
; X64-NEXT: shrb %dil
; X64-NEXT: orb %al, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq		; X64-NEXT: retq

%b = call i8 @llvm.bitreverse.i8(i8 %a)		%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b		ret i8 %b
}		}

declare i4 @llvm.bitreverse.i4(i4) readnone		declare i4 @llvm.bitreverse.i4(i4) readnone

define i4 @test_bitreverse_i4(i4 %a) {		define i4 @test_bitreverse_i4(i4 %a) {
; X86-LABEL: test_bitreverse_i4:		; X86-LABEL: test_bitreverse_i4:
; X86: # BB#0:		; X86: # BB#0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al		; X86-NEXT: movb {{[0-9]+}}(%esp), [[AL:%[a-s][i]*[l]]]
; X86-NEXT: rolb $4, %al		; X86-NEXT: movl [[EAX:%[e\|r][a-z][x\|i]]], [[ECX:%[e\|r][a-z][x\|i]]]
; X86-NEXT: movl %eax, %ecx		; X86-NEXT: andb $85, [[CL:%[a-s][i]*[l]]]
; X86-NEXT: andb $51, %cl		; X86-NEXT: xorb [[CL]], [[DL:%[a-s][i]*[l]]]
; X86-NEXT: shlb $2, %cl		; X86-NEXT: rolb $2, [[CL]]
; X86-NEXT: andb $-52, %al		; X86-NEXT: orb [[DL]], [[CL]]
; X86-NEXT: shrb $2, %al		; X86-NEXT: movl [[ECX]], [[EAX]]
; X86-NEXT: orb %cl, %al		; X86-NEXT: andb $102, [[AL:%[a-s][i]*[l]]]
; X86-NEXT: movl %eax, %ecx		; X86-NEXT: xorb [[AL]], [[CL]]
; X86-NEXT: andb $80, %cl		; X86-NEXT: rolb $4, [[AL]]
; X86-NEXT: addb %cl, %cl		; X86-NEXT: orb [[CL]], [[AL]]
; X86-NEXT: andb $-96, %al		; X86-NEXT: rolb [[AL]]
; X86-NEXT: shrb %al		; X86-NEXT: shrb $4, [[AL]]
; X86-NEXT: orb %cl, %al
; X86-NEXT: shrb $4, %al
; X86-NEXT: retl		; X86-NEXT: retl
;		;
; X64-LABEL: test_bitreverse_i4:		; X64-LABEL: test_bitreverse_i4:
; X64: # BB#0:		; X64: # BB#0:
; X64-NEXT: rolb $4, %dil		; X64-NEXT: movl [[EDI:%[e\|r][a-z][x\|i]]], [[ECX:%[e\|r][a-z][x\|i]]]
; X64-NEXT: movl %edi, %eax		; X64-NEXT: andb $85, [[CL:%[a-s][i]*[l]]]
; X64-NEXT: andb $51, %al		; X64-NEXT: xorb [[CL]], [[DL:%[a-s][i]*[l]]]
; X64-NEXT: shlb $2, %al		; X64-NEXT: rolb $2, [[CL]]
; X64-NEXT: andb $-52, %dil		; X64-NEXT: orb [[DL]], [[CL]]
; X64-NEXT: shrb $2, %dil		; X64-NEXT: movl [[ECX]], [[EAX:%[e\|r][a-z][x\|i]]]
; X64-NEXT: orb %al, %dil		; X64-NEXT: andb $102, [[AL:%[a-s][i]*[l]]]
; X64-NEXT: movl %edi, %eax		; X64-NEXT: xorb [[AL]], [[CL]]
; X64-NEXT: andb $80, %al		; X64-NEXT: rolb $4, [[AL]]
; X64-NEXT: addb %al, %al		; X64-NEXT: orb [[CL]], [[AL]]
; X64-NEXT: andb $-96, %dil		; X64-NEXT: rolb [[AL]]
; X64-NEXT: shrb %dil		; X64-NEXT: shrb $4, [[AL]]
; X64-NEXT: orb %al, %dil
; X64-NEXT: shrb $4, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq		; X64-NEXT: retq
%b = call i4 @llvm.bitreverse.i4(i4 %a)		%b = call i4 @llvm.bitreverse.i4(i4 %a)
ret i4 %b		ret i4 %b
}		}

; These tests check that bitreverse(constant) calls are folded		; These tests check that bitreverse(constant) calls are folded

define <2 x i16> @fold_v2i16() {		define <2 x i16> @fold_v2i16() {
▲ Show 20 Lines • Show All 113 Lines • Show Last 20 Lines