This is an archive of the discontinued LLVM Phabricator instance.

[X86] Use a shorter sequence to implement FLT_ROUNDS
ClosedPublic

Authored by craig.topper on Jan 28 2020, 5:50 PM.

Download Raw Diff

Details

Reviewers

andrew.w.kaylor
RKSimon
spatel

Commits

rGe5edd641fde0: [X86] Use a shorter sequence to implement FLT_ROUNDS

Summary

This code needs to map from the FPCW 2-bit encoding for rounding mode to the 2-bit encoding defined for FLT_ROUNDS. The previous implementation did some clever swapping of bits and adding 1 modulo 4 to do the mapping.

This patch instead uses an 8-bit immediate as a lookup table of four 2-bit values. We then use the 2-bit FPCW encoding to index the lookup table with a right shift and an AND. This requires extracting the 2-bit value from FPCW and multiplying it by 2 to make it usable as a shift amount, but it still results in less code.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

craig.topper created this revision. · Jan 28 2020, 5:50 PM

Herald added a project: Restricted Project. · View Herald Transcript · Jan 28 2020, 5:50 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

Use MVT::i8 for a shift amount instead of MVT::i16. The DAG legalizer was fixing it anyway, but using MVT::i8 is more correct.

LGTM

llvm/lib/Target/X86/X86ISelLowering.cpp
25430–25434	Nice hack. :) We could use a code comment to explain this better: 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]

This revision is now accepted and ready to land. · Jan 29 2020, 5:31 AM

Closed by commit rGe5edd641fde0: [X86] Use a shorter sequence to implement FLT_ROUNDS (authored by craig.topper). · Explain Why · Jan 29 2020, 9:02 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

28 lines

test/

CodeGen/

X86/

flt-rounds.ll

22 lines

Diff 241177

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 25,421 Lines • ▼ Show 20 Lines	SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,

FLT_ROUNDS, on the other hand, expects the following:		FLT_ROUNDS, on the other hand, expects the following:
-1 Undefined		-1 Undefined
0 Round to 0		0 Round to 0
1 Round to nearest		1 Round to nearest
2 Round to +inf		2 Round to +inf
3 Round to -inf		3 Round to -inf

To perform the conversion, we do:		To perform the conversion, we use a packed lookup table of the four 2-bit
(((((FPSR & 0x800) >> 11) \| ((FPSR & 0x400) >> 9)) + 1) & 3)		values that we can index by FPSP[11:10]
		0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]

		(0x2d >> ((FPSR & 0xc00) >> 9)) & 3
		[Inline comment by spatel (Unsubmitted, Not Done): Nice hack. :) We could use a code comment to explain this better: 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]]
*/		*/

MachineFunction &MF = DAG.getMachineFunction();		MachineFunction &MF = DAG.getMachineFunction();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();		const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Align StackAlignment(TFI.getStackAlignment());		const Align StackAlignment(TFI.getStackAlignment());
MVT VT = Op.getSimpleValueType();		MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);		SDLoc DL(Op);

Show All 11 Lines	SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,		SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other),		DAG.getVTList(MVT::Other),
Ops, MVT::i16, MMO);		Ops, MVT::i16, MMO);

// Load FP Control Word from stack slot		// Load FP Control Word from stack slot
SDValue CWD =		SDValue CWD =
DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());		DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

// Transform as necessary		// Mask and turn the control bits into a shift for the lookup table.
SDValue CWD1 =		SDValue Shift =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x800, DL, MVT::i16)),
DAG.getConstant(11, DL, MVT::i8));
SDValue CWD2 =
DAG.getNode(ISD::SRL, DL, MVT::i16,		DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,		DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x400, DL, MVT::i16)),		CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));		DAG.getConstant(9, DL, MVT::i8));
		Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);

		SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
SDValue RetVal =		SDValue RetVal =
DAG.getNode(ISD::AND, DL, MVT::i16,		DAG.getNode(ISD::AND, DL, MVT::i32,
DAG.getNode(ISD::ADD, DL, MVT::i16,		DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),		DAG.getConstant(3, DL, MVT::i32));
DAG.getConstant(1, DL, MVT::i16)),
DAG.getConstant(3, DL, MVT::i16));

return DAG.getZExtOrTrunc(RetVal, DL, VT);		return DAG.getZExtOrTrunc(RetVal, DL, VT);
}		}

// Split an unary integer op into 2 half sized ops.		// Split an unary integer op into 2 half sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {		static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();		MVT VT = Op.getSimpleValueType();
unsigned NumElems = VT.getVectorNumElements();		unsigned NumElems = VT.getVectorNumElements();
▲ Show 20 Lines • Show All 22,047 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/flt-rounds.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s \| FileCheck %s -check-prefixes=X86,X86-NOSSE			; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s \| FileCheck %s -check-prefixes=X86,X86-NOSSE
	; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse2 -verify-machineinstrs < %s \| FileCheck %s -check-prefixes=X86,X86-SSE			; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse2 -verify-machineinstrs < %s \| FileCheck %s -check-prefixes=X86,X86-SSE
	; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs < %s \| FileCheck %s -check-prefixes=X64			; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs < %s \| FileCheck %s -check-prefixes=X64

	declare i32 @llvm.flt.rounds()			declare i32 @llvm.flt.rounds()

	define i32 @test_flt_rounds() nounwind {			define i32 @test_flt_rounds() nounwind {
	; X86-LABEL: test_flt_rounds:			; X86-LABEL: test_flt_rounds:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: subl $12, %esp			; X86-NEXT: subl $12, %esp
	; X86-NEXT: fnstcw (%esp)			; X86-NEXT: fnstcw (%esp)
	; X86-NEXT: movl (%esp), %eax			; X86-NEXT: movzwl (%esp), %ecx
	; X86-NEXT: movl %eax, %ecx
	; X86-NEXT: shrl $9, %ecx			; X86-NEXT: shrl $9, %ecx
	; X86-NEXT: andl $2, %ecx			; X86-NEXT: andb $6, %cl
	; X86-NEXT: shrl $11, %eax			; X86-NEXT: movl $45, %eax
	; X86-NEXT: andl $1, %eax			; X86-NEXT: # kill: def $cl killed $cl killed $ecx
	; X86-NEXT: leal 1(%eax,%ecx), %eax			; X86-NEXT: shrl %cl, %eax
	; X86-NEXT: andl $3, %eax			; X86-NEXT: andl $3, %eax
	; X86-NEXT: addl $12, %esp			; X86-NEXT: addl $12, %esp
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: test_flt_rounds:			; X64-LABEL: test_flt_rounds:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp)			; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp)
	; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax			; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
	; X64-NEXT: movl %eax, %ecx
	; X64-NEXT: shrl $9, %ecx			; X64-NEXT: shrl $9, %ecx
	; X64-NEXT: andl $2, %ecx			; X64-NEXT: andb $6, %cl
	; X64-NEXT: shrl $11, %eax			; X64-NEXT: movl $45, %eax
	; X64-NEXT: andl $1, %eax			; X64-NEXT: # kill: def $cl killed $cl killed $ecx
	; X64-NEXT: leal 1(%rax,%rcx), %eax			; X64-NEXT: shrl %cl, %eax
	; X64-NEXT: andl $3, %eax			; X64-NEXT: andl $3, %eax
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = call i32 @llvm.flt.rounds()			%1 = call i32 @llvm.flt.rounds()
	ret i32 %1			ret i32 %1
	}			}