This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/Target/PowerPC/
-
Target/
-
PowerPC/
-
PPCISelLowering.h
1/1
PPCISelLowering.cpp
-
test/CodeGen/PowerPC/
-
CodeGen/
-
PowerPC/
-
bswap64.ll

Differential D39510

[PPC] Use xxbrd to speed up bswap64
ClosedPublic

Authored by Carrot on Nov 1 2017, 2:38 PM.

Download Raw Diff

Details

Reviewers

nemanjai
kbarton
echristo

Commits

rGe3b8d9a312bf: [PPC] Use xxbrd to speed up bswap64
rL317499: [PPC] Use xxbrd to speed up bswap64

Summary

Power doesn't have scalar bswap instructions, so LLVM generates the following code sequence for bswap64:

rotldi   5, 3, 16
rotldi   4, 3, 8
rotldi   9, 3, 24
rotldi   10, 3, 32
rotldi   11, 3, 48
rotldi   12, 3, 56
rldimi 4, 5, 8, 48
rldimi 4, 9, 16, 40
rldimi 4, 10, 24, 32
rldimi 4, 11, 40, 16
rldimi 4, 12, 48, 8
rldimi 4, 3, 56, 0

However, Power9 has vector bswap instructions, which can also be used to speed up the scalar bswap intrinsic. With this patch, bswap64 can be translated to:

mtvsrdd 34, 3, 3
xxbrd 34, 34
mfvsrld 3, 34

Diff Detail

Event Timeline

Carrot created this revision. Nov 1 2017, 2:38 PM

This is a great idea considering direct moves are so fast on Power9. I guess we just didn't think of this use when we implemented the vector byte reversal. Thanks for doing this. Other than the rather obvious change to generate the faster mfvsrd instruction, this LGTM.

lib/Target/PowerPC/PPCISelLowering.cpp
8571	Extracting LE doubleword 1 is probably better. It'll produce `mfvsrd` rather than `mfvsrld` on LE systems. The latter uses the permute pipeline and is potentially a higher-latency instruction. And it shouldn't make a functional difference since you're populating both doublewords.

This revision is now accepted and ready to land. Nov 2 2017, 12:31 AM

Will check in this version.

Closed by commit rL317499: [PPC] Use xxbrd to speed up bswap64 (authored by Carrot). · Explain Why · Nov 6 2017, 11:10 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

PowerPC/

PPCISelLowering.h

1 line

PPCISelLowering.cpp

22 lines

test/

CodeGen/

PowerPC/

bswap64.ll

13 lines

Diff 121186

lib/Target/PowerPC/PPCISelLowering.h

Show First 20 Lines • Show All 947 Lines • ▼ Show 20 Lines	private:
SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
		SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerCallResult(SDValue Chain, SDValue InFlag,		SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
▲ Show 20 Lines • Show All 150 Lines • Show Last 20 Lines

lib/Target/PowerPC/PPCISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 283 Lines • ▼ Show 20 Lines	if (Subtarget.hasFPRND()) {
setOperationAction(ISD::FROUND, MVT::f64, Legal);		setOperationAction(ISD::FROUND, MVT::f64, Legal);

setOperationAction(ISD::FFLOOR, MVT::f32, Legal);		setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);		setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);		setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);		setOperationAction(ISD::FROUND, MVT::f32, Legal);
}		}

// PowerPC does not have BSWAP		// PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd
		// to speed up scalar BSWAP64.
// CTPOP or CTTZ were introduced in P8/P9 respectivelly		// CTPOP or CTTZ were introduced in P8/P9 respectivelly
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);		setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
if (Subtarget.isISA3_0()) {		if (Subtarget.isISA3_0()) {
		setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Legal);		setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
setOperationAction(ISD::CTTZ , MVT::i64 , Legal);		setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
} else {		} else {
		setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
setOperationAction(ISD::CTTZ , MVT::i32 , Expand);		setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i64 , Expand);		setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
}		}

if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {		if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
setOperationAction(ISD::CTPOP, MVT::i32 , Legal);		setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
setOperationAction(ISD::CTPOP, MVT::i64 , Legal);		setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
} else {		} else {
▲ Show 20 Lines • Show All 8,243 Lines • ▼ Show 20 Lines	if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) \|\|
(Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))		(Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))
if (UI->getOperand(0) == Op.getOperand(0) &&		if (UI->getOperand(0) == Op.getOperand(0) &&
UI->getOperand(1) == Op.getOperand(1))		UI->getOperand(1) == Op.getOperand(1))
return SDValue();		return SDValue();
}		}
return Op;		return Op;
}		}

		// Lower scalar BSWAP64 to xxbrd.
		SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
		SDLoc dl(Op);
		// MTVSRDD
		Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
		Op.getOperand(0));
		// XXBRD
		Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op);
		// MFVSRLD or MFVSRD
		Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
		DAG.getTargetConstant(0, dl, MVT::i32));
		nemanjaiUnsubmitted Done Reply Inline Actions Extracting LE doubleword 1 is probably better. It'll produce `mfvsrd` rather than `mfvsrld` on LE systems. The latter uses the permute pipeline and is potentially a higher-latency instruction. And it shouldn't make a functional difference since you're populating both doublewords. nemanjai: Extracting LE doubleword 1 is probably better. It'll produce `mfvsrd` rather than `mfvsrld` on…
		return Op;
		}

SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,		SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc dl(Op);		SDLoc dl(Op);
// For v2i64 (VSX), we can pattern patch the v2i32 case (using fp <-> int		// For v2i64 (VSX), we can pattern patch the v2i32 case (using fp <-> int
// instructions), but for smaller types, we need to first extend up to v2i32		// instructions), but for smaller types, we need to first extend up to v2i32
// before doing going farther.		// before doing going farther.
if (Op.getValueType() == MVT::v2i64) {		if (Op.getValueType() == MVT::v2i64) {
EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();		EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
▲ Show 20 Lines • Show All 455 Lines • ▼ Show 20 Lines	SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);		case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);		case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);

case ISD::INTRINSIC_VOID:		case ISD::INTRINSIC_VOID:
return LowerINTRINSIC_VOID(Op, DAG);		return LowerINTRINSIC_VOID(Op, DAG);
case ISD::SREM:		case ISD::SREM:
case ISD::UREM:		case ISD::UREM:
return LowerREM(Op, DAG);		return LowerREM(Op, DAG);
		case ISD::BSWAP:
		return LowerBSWAP(Op, DAG);
}		}
}		}

void PPCTargetLowering::ReplaceNodeResults(SDNode *N,		void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,		SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc dl(N);		SDLoc dl(N);
switch (N->getOpcode()) {		switch (N->getOpcode()) {
▲ Show 20 Lines • Show All 4,494 Lines • Show Last 20 Lines

test/CodeGen/PowerPC/bswap64.ll

				; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64le-- -mcpu=pwr9 \| FileCheck %s

				declare i64 @llvm.bswap.i64(i64)

				; CHECK: mtvsrdd
				; CHECK: xxbrd
				; CHECK: mfvsrld
				define i64 @bswap64(i64 %x) {
				entry:
				%0 = call i64 @llvm.bswap.i64(i64 %x)
				ret i64 %0
				}