This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Use generic bitreverse intrinsic
ClosedPublic

Authored by arsenm on Nov 30 2015, 1:54 PM.

Download Raw Diff

Details

Reviewers

• tstellarAMD
jmolloy

Summary

Also fix bug in vector legalization for bitreverse.

Diff Detail

Event Timeline

arsenm updated this revision to Diff 41432. Nov 30 2015, 1:54 PM

arsenm retitled this revision to AMDGPU: Use generic bitreverse intrinsic.

arsenm updated this object.

arsenm added reviewers: • tstellarAMD, jmolloy.

arsenm added a subscriber: llvm-commits.

Herald added a subscriber: arsenm. · View Herald Transcript · Nov 30 2015, 1:54 PM

• tstellarAMD added inline comments. Dec 1 2015, 8:27 AM

lib/Target/AMDGPU/AMDGPUISelLowering.cpp
1039–1041	We have to keep this intrinsic, because we are using it in Mesa.

• tstellarAMD requested changes to this revision. Dec 1 2015, 9:00 AM

• tstellarAMD edited edge metadata.

This revision now requires changes to proceed. Dec 1 2015, 9:00 AM

Add compatibility with old intrinsic name

LGTM.

This revision is now accepted and ready to land. Dec 1 2015, 1:33 PM

Actually the AArch64 bitreverse test is broken from the legalization fix. It looks like it now gets scalarized and then fully expanded instead of the expansion with vector ops.

Fix ARM test failures. If the required vector bit instructions are available, defer legalization to LegalizeDAG.

At Matt's request I looked at this; it looks fine to me. I'm glad the AArch64 bitreverse test correctly caught the poor expansion!

r255512

Revision Contents

Path

Size

lib/

CodeGen/

SelectionDAG/

LegalizeVectorOps.cpp

23 lines

Target/

AMDGPU/

AMDGPUISelLowering.h

1 line

AMDGPUISelLowering.cpp

6 lines

AMDGPUInstrInfo.td

2 lines

SIISelLowering.cpp

1 line

SIInstructions.td

2 lines

test/

CodeGen/

AMDGPU/

bitreverse.ll

115 lines

llvm.AMDGPU.brev.ll

Diff 42590

lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Show First 20 Lines • Show All 100 Lines • ▼ Show 20 Lines	class VectorLegalizer {
/// \brief Implement vselect in terms of XOR, AND, OR when blend is not		/// \brief Implement vselect in terms of XOR, AND, OR when blend is not
/// supported by the target.		/// supported by the target.
SDValue ExpandVSELECT(SDValue Op);		SDValue ExpandVSELECT(SDValue Op);
SDValue ExpandSELECT(SDValue Op);		SDValue ExpandSELECT(SDValue Op);
SDValue ExpandLoad(SDValue Op);		SDValue ExpandLoad(SDValue Op);
SDValue ExpandStore(SDValue Op);		SDValue ExpandStore(SDValue Op);
SDValue ExpandFNEG(SDValue Op);		SDValue ExpandFNEG(SDValue Op);
SDValue ExpandABSDIFF(SDValue Op);		SDValue ExpandABSDIFF(SDValue Op);
		SDValue ExpandBITREVERSE(SDValue Op);

/// \brief Implements vector promotion.		/// \brief Implements vector promotion.
///		///
/// This is essentially just bitcasting the operands to a different type and		/// This is essentially just bitcasting the operands to a different type and
/// bitcasting the result back to the original type.		/// bitcasting the result back to the original type.
SDValue Promote(SDValue Op);		SDValue Promote(SDValue Op);

/// \brief Implements [SU]INT_TO_FP vector promotion.		/// \brief Implements [SU]INT_TO_FP vector promotion.
▲ Show 20 Lines • Show All 159 Lines • ▼ Show 20 Lines	SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::OR:		case ISD::OR:
case ISD::XOR:		case ISD::XOR:
case ISD::SHL:		case ISD::SHL:
case ISD::SRA:		case ISD::SRA:
case ISD::SRL:		case ISD::SRL:
case ISD::ROTL:		case ISD::ROTL:
case ISD::ROTR:		case ISD::ROTR:
case ISD::BSWAP:		case ISD::BSWAP:
		case ISD::BITREVERSE:
case ISD::CTLZ:		case ISD::CTLZ:
case ISD::CTTZ:		case ISD::CTTZ:
case ISD::CTLZ_ZERO_UNDEF:		case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTTZ_ZERO_UNDEF:		case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTPOP:		case ISD::CTPOP:
case ISD::SELECT:		case ISD::SELECT:
case ISD::VSELECT:		case ISD::VSELECT:
case ISD::SELECT_CC:		case ISD::SELECT_CC:
▲ Show 20 Lines • Show All 424 Lines • ▼ Show 20 Lines	case ISD::UINT_TO_FP:
return ExpandUINT_TO_FLOAT(Op);		return ExpandUINT_TO_FLOAT(Op);
case ISD::FNEG:		case ISD::FNEG:
return ExpandFNEG(Op);		return ExpandFNEG(Op);
case ISD::SETCC:		case ISD::SETCC:
return UnrollVSETCC(Op);		return UnrollVSETCC(Op);
case ISD::UABSDIFF:		case ISD::UABSDIFF:
case ISD::SABSDIFF:		case ISD::SABSDIFF:
return ExpandABSDIFF(Op);		return ExpandABSDIFF(Op);
		case ISD::BITREVERSE:
		return ExpandBITREVERSE(Op);
default:		default:
return DAG.UnrollVectorOp(Op.getNode());		return DAG.UnrollVectorOp(Op.getNode());
}		}
}		}

SDValue VectorLegalizer::ExpandABSDIFF(SDValue Op) {		SDValue VectorLegalizer::ExpandABSDIFF(SDValue Op) {
SDLoc dl(Op);		SDLoc dl(Op);
SDValue Op0 = Op.getOperand(0);		SDValue Op0 = Op.getOperand(0);
▲ Show 20 Lines • Show All 197 Lines • ▼ Show 20 Lines	SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {

SDLoc DL(Op);		SDLoc DL(Op);
Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0));		Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0));
Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),		Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
ShuffleMask.data());		ShuffleMask.data());
return DAG.getNode(ISD::BITCAST, DL, VT, Op);		return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}		}

		SDValue VectorLegalizer::ExpandBITREVERSE(SDValue Op) {
		EVT VT = Op.getValueType();

		// If we have the scalar operation, it's probably cheaper to unroll it.
		if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType()))
		return DAG.UnrollVectorOp(Op.getNode());

		// If we have the appropriate vector bit operations, it is better to use them
		// than unrolling and expanding each component.
		if (!TLI.isOperationLegalOrCustom(ISD::SHL, VT) \|\|
		!TLI.isOperationLegalOrCustom(ISD::SRL, VT) \|\|
		!TLI.isOperationLegalOrCustom(ISD::AND, VT) \|\|
		!TLI.isOperationLegalOrCustom(ISD::OR, VT))
		return DAG.UnrollVectorOp(Op.getNode());

		// Let LegalizeDAG handle this later.
		return Op;
		}

SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {		SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
// Implement VSELECT in terms of XOR, AND, OR		// Implement VSELECT in terms of XOR, AND, OR
// on platforms which do not support blend natively.		// on platforms which do not support blend natively.
SDLoc DL(Op);		SDLoc DL(Op);

SDValue Mask = Op.getOperand(0);		SDValue Mask = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);		SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);		SDValue Op2 = Op.getOperand(2);
▲ Show 20 Lines • Show All 123 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUISelLowering.h

Show First 20 Lines • Show All 257 Lines • ▼ Show 20 Lines	enum NodeType : unsigned {
FP_CLASS,		FP_CLASS,
DOT4,		DOT4,
CARRY,		CARRY,
BORROW,		BORROW,
BFE_U32, // Extract range of bits with zero extension to 32-bits.		BFE_U32, // Extract range of bits with zero extension to 32-bits.
BFE_I32, // Extract range of bits with sign extension to 32-bits.		BFE_I32, // Extract range of bits with sign extension to 32-bits.
BFI, // (src0 & src1) \| (~src0 & src2)		BFI, // (src0 & src1) \| (~src0 & src2)
BFM, // Insert a range of bits into a 32-bit word.		BFM, // Insert a range of bits into a 32-bit word.
BREV, // Reverse bits.
MUL_U24,		MUL_U24,
MUL_I24,		MUL_I24,
MAD_U24,		MAD_U24,
MAD_I24,		MAD_I24,
TEXTURE_FETCH,		TEXTURE_FETCH,
EXPORT,		EXPORT,
CONST_ADDRESS,		CONST_ADDRESS,
REGISTER_LOAD,		REGISTER_LOAD,
Show All 40 Lines

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 1,030 Lines • ▼ Show 20 Lines	case AMDGPUIntrinsic::AMDGPU_bfi:
Op.getOperand(2),		Op.getOperand(2),
Op.getOperand(3));		Op.getOperand(3));

case AMDGPUIntrinsic::AMDGPU_bfm:		case AMDGPUIntrinsic::AMDGPU_bfm:
return DAG.getNode(AMDGPUISD::BFM, DL, VT,		return DAG.getNode(AMDGPUISD::BFM, DL, VT,
Op.getOperand(1),		Op.getOperand(1),
Op.getOperand(2));		Op.getOperand(2));

case AMDGPUIntrinsic::AMDGPU_brev:
return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));

tstellarAMD (inline comment; Unsubmitted, Not Done): We have to keep this intrinsic, because we are using it in Mesa.
case Intrinsic::AMDGPU_class:		case Intrinsic::AMDGPU_class:
return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,		return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
Op.getOperand(1), Op.getOperand(2));		Op.getOperand(1), Op.getOperand(2));

case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.		case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));		return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));

case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.		case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));		return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.		case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));		return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
		case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name
		return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1));
}		}
}		}

///IABS(a) = SMAX(sub(0, a), a)		///IABS(a) = SMAX(sub(0, a), a)
SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,		SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc DL(Op);		SDLoc DL(Op);
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
▲ Show 20 Lines • Show All 1,625 Lines • ▼ Show 20 Lines	const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FP_CLASS)		NODE_NAME_CASE(FP_CLASS)
NODE_NAME_CASE(DOT4)		NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(CARRY)		NODE_NAME_CASE(CARRY)
NODE_NAME_CASE(BORROW)		NODE_NAME_CASE(BORROW)
NODE_NAME_CASE(BFE_U32)		NODE_NAME_CASE(BFE_U32)
NODE_NAME_CASE(BFE_I32)		NODE_NAME_CASE(BFE_I32)
NODE_NAME_CASE(BFI)		NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)		NODE_NAME_CASE(BFM)
NODE_NAME_CASE(BREV)
NODE_NAME_CASE(MUL_U24)		NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)		NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(MAD_U24)		NODE_NAME_CASE(MAD_U24)
NODE_NAME_CASE(MAD_I24)		NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(TEXTURE_FETCH)		NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)		NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)		NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)		NODE_NAME_CASE(REGISTER_LOAD)
▲ Show 20 Lines • Show All 167 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUInstrInfo.td

	Show First 20 Lines • Show All 185 Lines • ▼ Show 20 Lines
	def AMDGPUround : SDNode<"ISD::FROUND",			def AMDGPUround : SDNode<"ISD::FROUND",
	SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;			SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;

	def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;			def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
	def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;			def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
	def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;			def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
	def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;			def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;

	def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>;

	// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when			// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when
	// performing the mulitply. The result is a 32-bit value.			// performing the mulitply. The result is a 32-bit value.
	def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,			def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
	[SDNPCommutative]			[SDNPCommutative]
	>;			>;
	def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,			def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
	[SDNPCommutative]			[SDNPCommutative]
	>;			>;
	▲ Show 20 Lines • Show All 42 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

Show First 20 Lines • Show All 101 Lines • ▼ Show 20 Lines	SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);		setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);		setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);		setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);

setOperationAction(ISD::SETCC, MVT::v2i1, Expand);		setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);		setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

setOperationAction(ISD::BSWAP, MVT::i32, Legal);		setOperationAction(ISD::BSWAP, MVT::i32, Legal);
		setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);		setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);		setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);		setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);

setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);		setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);		setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);		setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
▲ Show 20 Lines • Show All 2,377 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 121 Lines • ▼ Show 20 Lines	defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64",
[(set i64:$dst, (not i64:$src0))]		[(set i64:$dst, (not i64:$src0))]
>;		>;
defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>;		defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>;
defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>;		defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>;
} // End Defs = [SCC]		} // End Defs = [SCC]


defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",		defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
[(set i32:$dst, (AMDGPUbrev i32:$src0))]		[(set i32:$dst, (bitreverse i32:$src0))]
>;		>;
defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;		defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;

let Defs = [SCC] in {		let Defs = [SCC] in {
defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>;		defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>;
defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>;		defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>;
defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32",		defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32",
[(set i32:$dst, (ctpop i32:$src0))]		[(set i32:$dst, (ctpop i32:$src0))]
▲ Show 20 Lines • Show All 3,134 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/bitreverse.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s
				; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s

				declare i16 @llvm.bitreverse.i16(i16) #1
				declare i32 @llvm.bitreverse.i32(i32) #1
				declare i64 @llvm.bitreverse.i64(i64) #1

				declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
				declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1

				declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
				declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1

				declare i32 @llvm.AMDGPU.brev(i32) #1

				; FUNC-LABEL: {{^}}s_brev_i16:
				; SI: s_brev_b32
				define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
				%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
				store i16 %brev, i16 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}v_brev_i16:
				; SI: v_bfrev_b32_e32
				define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
				%val = load i16, i16 addrspace(1)* %valptr
				%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
				store i16 %brev, i16 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}s_brev_i32:
				; SI: s_load_dword [[VAL:s[0-9]+]],
				; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]]
				; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
				; SI: buffer_store_dword [[VRESULT]],
				; SI: s_endpgm
				define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
				%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
				store i32 %brev, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}v_brev_i32:
				; SI: buffer_load_dword [[VAL:v[0-9]+]],
				; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
				; SI: buffer_store_dword [[RESULT]],
				; SI: s_endpgm
				define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
				%val = load i32, i32 addrspace(1)* %valptr
				%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
				store i32 %brev, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}s_brev_v2i32:
				; SI: s_brev_b32
				; SI: s_brev_b32
				define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
				%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
				store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}v_brev_v2i32:
				; SI: v_bfrev_b32_e32
				; SI: v_bfrev_b32_e32
				define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
				%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
				%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
				store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}s_brev_i64:
				define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
				%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
				store i64 %brev, i64 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}v_brev_i64:
				define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
				%val = load i64, i64 addrspace(1)* %valptr
				%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
				store i64 %brev, i64 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}s_brev_v2i64:
				define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
				%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
				store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}v_brev_v2i64:
				define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
				%val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
				%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
				store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}legacy_s_brev_i32:
				; SI: s_brev_b32
				define void @legacy_s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
				%brev = call i32 @llvm.AMDGPU.brev(i32 %val) #1
				store i32 %brev, i32 addrspace(1)* %out
				ret void
				}

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll

This file was deleted.

	; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s

	declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone

	; FUNC-LABEL: {{^}}s_brev_i32:
	; SI: s_load_dword [[VAL:s[0-9]+]],
	; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]]
	; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
	; SI: buffer_store_dword [[VRESULT]],
	; SI: s_endpgm
	define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
	%ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
	store i32 %ctlz, i32 addrspace(1)* %out, align 4
	ret void
	}

	; FUNC-LABEL: {{^}}v_brev_i32:
	; SI: buffer_load_dword [[VAL:v[0-9]+]],
	; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
	; SI: buffer_store_dword [[RESULT]],
	; SI: s_endpgm
	define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
	%val = load i32, i32 addrspace(1)* %valptr, align 4
	%ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
	store i32 %ctlz, i32 addrspace(1)* %out, align 4
	ret void
	}