This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add new BFI intrinsic
Needs RevisionPublic

Authored by tsymalla on Jul 4 2023, 2:31 AM.

Download Raw Diff

Details

Reviewers

foad
arsenm

Summary

This adds a new BFI intrinsic which can be used to emit the v_bfi instruction
directly with a mask.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

tsymalla created this revision.Jul 4 2023, 2:31 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 4 2023, 2:31 AM

Herald added subscribers: StephenFan, kerbowa, hiraditya and 5 others. · View Herald Transcript

tsymalla requested review of this revision.Jul 4 2023, 2:31 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 4 2023, 2:31 AM

Herald added subscribers: llvm-commits, wdng. · View Herald Transcript

We’ve specifically avoided adding intrinsics for easy to match instructions. This needs a semantic justification over just emitting the expanded bit sequence. It’s a huge amount of work teaching every part of the compiler the equivalent bit optimizations.

llvm/test/CodeGen/AMDGPU/bfi_nested.ll
3	Why lose a test?

This revision now requires changes to proceed.Jul 4 2023, 2:43 AM

In D154422#4470693, @arsenm wrote:

We’ve specifically avoided adding intrinsics for easy to match instructions. This needs a semantic justification over just emitting the expanded bit sequence. It’s a huge amount of work teaching every part of the compiler the equivalent bit optimizations.

Unfortunately, in the case of BFI instructions, these are not so easy to match in LLVM. Some issues prevent us from generating nested v_bfi instructions (e.g. one BFI as the base of another), which in turn means we generate far fewer v_bfi instructions than we could. I have been working on that for a while in https://reviews.llvm.org/D136432, and it is not easy to get it working properly. From my tests, it seems that there is no real advantage in terms of codegen when trying to match the and / or patterns, so I thought that emitting the intrinsic directly would be sufficient and an improvement over the current state. This should not replace the few existing BFI ISel patterns but rather serve as a way to teach the middle-end to generate BFI instructions.

llvm/test/CodeGen/AMDGPU/bfi_nested.ll
3	Because that was implemented as the basis for the aforementioned, abandoned BFI patch, which was never merged.

Harbormaster completed remote builds in B242974: Diff 537001.Jul 4 2023, 4:11 AM

jsilvanus added a subscriber: jsilvanus.Jul 5 2023, 12:47 AM

Revision Contents

Path

Size

llvm/

include/

llvm/

IR/

IntrinsicsAMDGPU.td

5 lines

lib/

Target/

AMDGPU/

AMDGPUISelDAGToDAG.cpp

21 lines

AMDGPUInstrInfo.td

6 lines

AMDGPURegisterBankInfo.cpp

1 line

EvergreenInstructions.td

2 lines

SIISelLowering.cpp

3 lines

test/

CodeGen/

AMDGPU/

bfi-intrinsic.ll

66 lines

bfi_nested.ll

Diff 537001

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Show First 20 Lines • Show All 1,851 Lines • ▼ Show 20 Lines	def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
[IntrNoMem, IntrSpeculatable]		[IntrNoMem, IntrSpeculatable]
>;		>;

def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],		def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],		[LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]		[IntrNoMem, IntrSpeculatable]
>;		>;

		def int_amdgcn_bfi : DefaultAttrsIntrinsic<[llvm_i32_ty],
		[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrSpeculatable]
		>;

def int_amdgcn_lerp :		def int_amdgcn_lerp :
ClangBuiltin<"__builtin_amdgcn_lerp">,		ClangBuiltin<"__builtin_amdgcn_lerp">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],		DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]		[IntrNoMem, IntrSpeculatable]
>;		>;

def int_amdgcn_sad_u8 :		def int_amdgcn_sad_u8 :
ClangBuiltin<"__builtin_amdgcn_sad_u8">,		ClangBuiltin<"__builtin_amdgcn_sad_u8">,
▲ Show 20 Lines • Show All 898 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Show First 20 Lines • Show All 601 Lines • ▼ Show 20 Lines	else {
ConstantSDNode *C = cast<ConstantSDNode>(N);		ConstantSDNode *C = cast<ConstantSDNode>(N);
Imm = C->getZExtValue();		Imm = C->getZExtValue();
}		}

SDLoc DL(N);		SDLoc DL(N);
ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));		ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
return;		return;
}		}
		case AMDGPUISD::BFI: {
		ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(0));
		if (!Mask)
		break;

		if (Mask->getZExtValue() == 0) {
		ReplaceNode(N, N->getOperand(2).getNode());
		return;
		}

		if (Mask->getSExtValue() == -1) {
		ReplaceNode(N, N->getOperand(1).getNode());
		return;
		}

		const SDValue Ops[] = {N->getOperand(0), N->getOperand(1), N->getOperand(2) };
		SDNode *BFI = CurDAG->getMachineNode(AMDGPU::V_BFI_B32_e64, SDLoc(N), N->getValueType(0), Ops);
		ReplaceNode(N, BFI);
		return;
		}

case AMDGPUISD::BFE_I32:		case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {		case AMDGPUISD::BFE_U32: {
// There is a scalar version available, but unlike the vector version which		// There is a scalar version available, but unlike the vector version which
// has a separate operand for the offset and width, the scalar version packs		// has a separate operand for the offset and width, the scalar version packs
// the width and offset into a single operand. Try to move to the scalar		// the width and offset into a single operand. Try to move to the scalar
// version if the offsets are constant, so that we can try to keep extended		// version if the offsets are constant, so that we can try to keep extended
// loads of kernel arguments in SGPRs.		// loads of kernel arguments in SGPRs.

▲ Show 20 Lines • Show All 2,377 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Show First 20 Lines • Show All 264 Lines • ▼ Show 20 Lines	def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,		[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;		SDNPMemOperand]>;

def AMDGPUround : SDNode<"ISD::FROUND",		def AMDGPUround : SDNode<"ISD::FROUND",
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;		SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;

def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;		def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;		def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;		def AMDGPUbfi_impl : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;		def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;

def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;		def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;
def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;		def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;

def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;		def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;

// Signed and unsigned 24-bit multiply. The highest 8-bits are ignore		// Signed and unsigned 24-bit multiply. The highest 8-bits are ignore
▲ Show 20 Lines • Show All 187 Lines • ▼ Show 20 Lines
def AMDGPUbfe_i32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),		def AMDGPUbfe_i32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_sbfe node:$src0, node:$src1, node:$src2),		[(int_amdgcn_sbfe node:$src0, node:$src1, node:$src2),
(AMDGPUbfe_i32_impl node:$src0, node:$src1, node:$src2)]>;		(AMDGPUbfe_i32_impl node:$src0, node:$src1, node:$src2)]>;

def AMDGPUbfe_u32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),		def AMDGPUbfe_u32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2),		[(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2),
(AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>;		(AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>;

		def AMDGPUbfi : PatFrags<(ops node:$src0, node:$src1, node:$src2),
		[(int_amdgcn_bfi node:$src0, node:$src1, node:$src2),
		(AMDGPUbfi_impl node:$src0, node:$src1, node:$src2)]>;

def AMDGPUfmul_legacy : PatFrags<(ops node:$src0, node:$src1),		def AMDGPUfmul_legacy : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_fmul_legacy node:$src0, node:$src1),		[(int_amdgcn_fmul_legacy node:$src0, node:$src1),
(AMDGPUfmul_legacy_impl node:$src0, node:$src1)]>;		(AMDGPUfmul_legacy_impl node:$src0, node:$src1)]>;

def AMDGPUfdot2 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$clamp),		def AMDGPUfdot2 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$clamp),
[(int_amdgcn_fdot2 node:$src0, node:$src1, node:$src2, node:$clamp),		[(int_amdgcn_fdot2 node:$src0, node:$src1, node:$src2, node:$clamp),
(AMDGPUfdot2_impl node:$src0, node:$src1, node:$src2, node:$clamp)]>;		(AMDGPUfdot2_impl node:$src0, node:$src1, node:$src2, node:$clamp)]>;

def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc),		def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc),
[(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc),		[(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc),
(AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>;		(AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>;

def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),		def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_perm node:$src0, node:$src1, node:$src2),		[(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
(AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;		(AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Show First 20 Lines • Show All 4,261 Lines • ▼ Show 20 Lines	case AMDGPU::G_INTRINSIC: {
case Intrinsic::amdgcn_sudot4:		case Intrinsic::amdgcn_sudot4:
case Intrinsic::amdgcn_sudot8:		case Intrinsic::amdgcn_sudot8:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:		case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:		case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:		case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:		case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:		case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:		case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
		case Intrinsic::amdgcn_bfi:
return getDefaultMappingVOP(MI);		return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_sbfe:		case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_ubfe:		case Intrinsic::amdgcn_ubfe:
if (isSALUMapping(MI))		if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);		return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);		return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_ds_swizzle:		case Intrinsic::amdgcn_ds_swizzle:
case Intrinsic::amdgcn_ds_permute:		case Intrinsic::amdgcn_ds_permute:
▲ Show 20 Lines • Show All 608 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/EvergreenInstructions.td

	Show First 20 Lines • Show All 403 Lines • ▼ Show 20 Lines
	>;			>;

	def : AMDGPUPat <			def : AMDGPUPat <
	(sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),			(sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
	(BFE_INT_eg $src, (MOV_IMM_I32 (i32 0)), $width)			(BFE_INT_eg $src, (MOV_IMM_I32 (i32 0)), $width)
	>;			>;

	def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",			def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
	[(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],			[(set i32:$dst, (AMDGPUbfi_impl i32:$src0, i32:$src1, i32:$src2))],
	VecALU			VecALU
	>;			>;

	def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i1)),			def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i1)),
	(BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>;			(BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>;
	def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i8)),			def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i8)),
	(BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>;			(BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>;
	def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i16)),			def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i16)),
	▲ Show 20 Lines • Show All 453 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//		//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
/// \file		/// \file
/// Custom DAG lowering for SI		/// Custom DAG lowering for SI
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"		#include "SIISelLowering.h"
#include "AMDGPU.h"		#include "AMDGPU.h"
		#include "AMDGPUISelLowering.h"
#include "AMDGPUInstrInfo.h"		#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"		#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"		#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"		#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"		#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"		#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"		#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"		#include "llvm/ADT/Statistic.h"
▲ Show 20 Lines • Show All 7,247 Lines • ▼ Show 20 Lines	case Intrinsic::amdgcn_fdot2:
return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,		return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),		Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
Op.getOperand(4));		Op.getOperand(4));
case Intrinsic::amdgcn_fmul_legacy:		case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,		return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));		Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_sffbh:		case Intrinsic::amdgcn_sffbh:
return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));		return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
		case Intrinsic::amdgcn_bfi:
		return DAG.getNode(AMDGPUISD::BFI, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_sbfe:		case Intrinsic::amdgcn_sbfe:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,		return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));		Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_ubfe:		case Intrinsic::amdgcn_ubfe:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,		return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));		Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_cvt_pkrtz:		case Intrinsic::amdgcn_cvt_pkrtz:
case Intrinsic::amdgcn_cvt_pknorm_i16:		case Intrinsic::amdgcn_cvt_pknorm_i16:
▲ Show 20 Lines • Show All 6,963 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/bfi-intrinsic.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefix=GCN %s

				define i32 @v_bfi_simple(i32 %x, i32 %y, i32 %z) {
				; GCN-LABEL: v_bfi_simple:
				; GCN: ; %bb.0: ; %entry
				; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GCN-NEXT: s_movk_i32 s4, 0x400
				; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
				; GCN-NEXT: s_setpc_b64 s[30:31]
				entry:
				%bfi = call i32 @llvm.amdgcn.bfi(i32 1024, i32 %x, i32 %y)
				ret i32 %bfi
				}

				define i32 @v_bfi(i32 %x, i32 %y, i32 %z) {
				; GCN-LABEL: v_bfi:
				; GCN: ; %bb.0: ; %entry
				; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GCN-NEXT: v_bfi_b32 v0, 1, v0, v1
				; GCN-NEXT: v_bfi_b32 v0, 8, v0, v2
				; GCN-NEXT: s_setpc_b64 s[30:31]
				entry:
				%bfi = call i32 @llvm.amdgcn.bfi(i32 1, i32 %x, i32 %y)
				%bfi.1 = call i32 @llvm.amdgcn.bfi(i32 8, i32 %bfi, i32 %z)
				ret i32 %bfi.1
				}

				define i32 @v_bfi_zero_mask(i32 %x, i32 %y, i32 %z) {
				; GCN-LABEL: v_bfi_zero_mask:
				; GCN: ; %bb.0: ; %entry
				; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GCN-NEXT: v_mul_lo_u32 v0, v1, v2
				; GCN-NEXT: s_setpc_b64 s[30:31]
				entry:
				%bfi = call i32 @llvm.amdgcn.bfi(i32 0, i32 %x, i32 %y)
				%ret = mul i32 %bfi, %z
				ret i32 %ret
				}

				define i32 @v_bfi_minus_one_mask(i32 %x, i32 %y, i32 %z) {
				; GCN-LABEL: v_bfi_minus_one_mask:
				; GCN: ; %bb.0: ; %entry
				; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
				; GCN-NEXT: s_setpc_b64 s[30:31]
				entry:
				%bfi = call i32 @llvm.amdgcn.bfi(i32 -1, i32 %x, i32 %y)
				%ret = mul i32 %bfi, %z
				ret i32 %ret
				}

				define i32 @v_bfi_non_const_mask(i32 %x, i32 %y, i32 %z, i32 %mask) {
				; GCN-LABEL: v_bfi_non_const_mask:
				; GCN: ; %bb.0: ; %entry
				; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GCN-NEXT: v_bfi_b32 v0, v3, v0, v1
				; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
				; GCN-NEXT: s_setpc_b64 s[30:31]
				entry:
				%bfi = call i32 @llvm.amdgcn.bfi(i32 %mask, i32 %x, i32 %y)
				%ret = mul i32 %bfi, %z
				ret i32 %ret
				}

				declare i32 @llvm.amdgcn.bfi(i32, i32, i32)

llvm/test/CodeGen/AMDGPU/bfi_nested.ll

This file was deleted.

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefix=GCN %s

	arsenmUnsubmitted Not Done Reply Inline Actions Why lose a test arsenm: Why lose a test
	tsymallaAuthorUnsubmitted Done Reply Inline Actions Because that was implemented as base for the aforementioned, abandoned BFI patch which was never merged. tsymalla: Because that was implemented as base for the aforementioned, abandoned BFI patch which was never…
	define float @v_bfi_single_nesting_level(float %x, float %y, float %z) {
	; GCN-LABEL: v_bfi_single_nesting_level:
	; GCN: ; %bb.0: ; %.entry
	; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_mul_f32_e32 v2, 0x447fc000, v2
	; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
	; GCN-NEXT: v_mul_f32_e32 v0, 0x447fc000, v0
	; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
	; GCN-NEXT: v_and_b32_e32 v1, 0xffc00, v1
	; GCN-NEXT: v_and_b32_e32 v2, 0xc00003ff, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v0
	; GCN-NEXT: v_or_b32_e32 v1, v1, v2
	; GCN-NEXT: v_and_b32_e32 v0, 0x3ff00000, v0
	; GCN-NEXT: v_or_b32_e32 v0, v1, v0
	; GCN-NEXT: s_setpc_b64 s[30:31]
	.entry:
	%mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
	%mul.base.i32 = fptoui float %mul.base to i32
	%y.i32 = fptoui float %y to i32
	%shl.inner.insert = shl i32 %y.i32, 10
	%bfi1.and = and i32 %shl.inner.insert, 1047552
	%bfi1.andnot = and i32 %mul.base.i32, -1073740801
	%bfi1.or = or i32 %bfi1.and, %bfi1.andnot
	%mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
	%mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
	%shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
	%and.outer = and i32 %shl.outer.insert, 1072693248
	%or.outer = or i32 %bfi1.or, %and.outer
	%result = bitcast i32 %or.outer to float
	ret float %result
	}

	define float @v_bfi_single_nesting_level_swapped_operands(float %x, float %y, float %z) {
	; GCN-LABEL: v_bfi_single_nesting_level_swapped_operands:
	; GCN: ; %bb.0: ; %.entry
	; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_mul_f32_e32 v2, 0x447fc000, v2
	; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
	; GCN-NEXT: v_mul_f32_e32 v0, 0x447fc000, v0
	; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
	; GCN-NEXT: v_and_b32_e32 v1, 0xffc00, v1
	; GCN-NEXT: v_and_b32_e32 v2, 0xc00003ff, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v0
	; GCN-NEXT: v_or_b32_e32 v1, v1, v2
	; GCN-NEXT: v_and_b32_e32 v0, 0x3ff00000, v0
	; GCN-NEXT: v_or_b32_e32 v0, v0, v1
	; GCN-NEXT: s_setpc_b64 s[30:31]
	.entry:
	%mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
	%mul.base.i32 = fptoui float %mul.base to i32
	%y.i32 = fptoui float %y to i32
	%shl.inner.insert = shl i32 %y.i32, 10
	%bfi1.and = and i32 1047552, %shl.inner.insert
	%bfi1.andnot = and i32 -1073740801, %mul.base.i32
	%bfi1.or = or i32 %bfi1.and, %bfi1.andnot
	%mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
	%mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
	%shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
	%and.outer = and i32 %shl.outer.insert, 1072693248
	%or.outer = or i32 %and.outer, %bfi1.or
	%result = bitcast i32 %or.outer to float
	ret float %result
	}

	define float @v_bfi_single_nesting_level_unbalanced_subtree(float %x, float %y, float %z) {
	; GCN-LABEL: v_bfi_single_nesting_level_unbalanced_subtree:
	; GCN: ; %bb.0: ; %.entry
	; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_mul_f32_e32 v2, 0x447fc000, v2
	; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
	; GCN-NEXT: v_mul_f32_e32 v0, 0x447fc000, v0
	; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
	; GCN-NEXT: v_and_b32_e32 v1, 0xffc00, v1
	; GCN-NEXT: v_and_b32_e32 v3, 0x3e0, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v0
	; GCN-NEXT: v_or_b32_e32 v1, v1, v3
	; GCN-NEXT: v_and_b32_e32 v0, 0x3ff00000, v0
	; GCN-NEXT: v_and_b32_e32 v2, 0xc000001f, v2
	; GCN-NEXT: v_or_b32_e32 v1, v2, v1
	; GCN-NEXT: v_or_b32_e32 v0, v0, v1
	; GCN-NEXT: s_setpc_b64 s[30:31]
	.entry:
	%mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
	%mul.base.i32 = fptoui float %mul.base to i32
	%y.i32 = fptoui float %y to i32
	%shl.inner.2.insert = shl i32 %y.i32, 10
	%bfi.inner.2.and.1 = and i32 %shl.inner.2.insert, 1047552
	%bfi.inner.2.and.2 = and i32 %mul.base.i32, 992
	%bfi.inner.2 = or i32 %bfi.inner.2.and.1, %bfi.inner.2.and.2
	%mul.inner.1.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
	%mul.inner.1.insert.1.i32 = fptoui float %mul.inner.1.insert to i32
	%shl.inner.1.insert.1 = shl i32 %mul.inner.1.insert.1.i32, 20
	%bfi.inner.1.and.1 = and i32 %shl.inner.1.insert.1, 1072693248
	%bfi.inner.1.and.2 = and i32 %mul.base.i32, -1073741793
	%bfi.inner.1 = or i32 %bfi.inner.1.and.2, %bfi.inner.2
	%bfi.outer = or i32 %bfi.inner.1.and.1, %bfi.inner.1
	%result = bitcast i32 %bfi.outer to float
	ret float %result
	}

	define float @v_bfi_single_nesting_level_inner_use(float %x, float %y, float %z) {
	; GCN-LABEL: v_bfi_single_nesting_level_inner_use:
	; GCN: ; %bb.0: ; %.entry
	; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_mul_f32_e32 v0, 0x447fc000, v2
	; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
	; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
	; GCN-NEXT: v_and_b32_e32 v1, 0xffc00, v1
	; GCN-NEXT: v_and_b32_e32 v0, 0x400003ff, v0
	; GCN-NEXT: v_or_b32_e32 v0, v1, v0
	; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
	; GCN-NEXT: s_setpc_b64 s[30:31]
	.entry:
	%mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
	%mul.base.i32 = fptoui float %mul.base to i32
	%y.i32 = fptoui float %y to i32
	%shl.inner.insert = shl i32 %y.i32, 10
	%bfi1.and = and i32 %shl.inner.insert, 1047552
	%bfi1.andnot = and i32 %mul.base.i32, -1073740801
	%bfi1.or = or i32 %bfi1.and, %bfi1.andnot
	%mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
	%mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
	%shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
	%and.outer = and i32 %shl.outer.insert, 1072693248
	%or.outer = or i32 %bfi1.or, %and.outer
	%bfi1.or.seconduse = mul i32 %bfi1.or, 2
	%result = bitcast i32 %bfi1.or.seconduse to float
	ret float %result
	}

	define float @v_bfi_no_nesting(float %x, float %y, float %z) {
	; GCN-LABEL: v_bfi_no_nesting:
	; GCN: ; %bb.0: ; %.entry
	; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_mul_f32_e32 v2, 0x447fc000, v2
	; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
	; GCN-NEXT: v_mul_f32_e32 v0, 0x447fc000, v0
	; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
	; GCN-NEXT: v_and_b32_e32 v1, 0xffc00, v1
	; GCN-NEXT: v_and_b32_e32 v2, 0xc0000400, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v0
	; GCN-NEXT: v_or_b32_e32 v1, v1, v2
	; GCN-NEXT: v_and_b32_e32 v0, 0x3ff00000, v0
	; GCN-NEXT: v_or_b32_e32 v0, v1, v0
	; GCN-NEXT: s_setpc_b64 s[30:31]
	.entry:
	%mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
	%mul.base.i32 = fptoui float %mul.base to i32
	%y.i32 = fptoui float %y to i32
	%shl.inner.insert = shl i32 %y.i32, 10
	%inner.and = and i32 %shl.inner.insert, 1047552
	%inner.and2 = and i32 %mul.base.i32, -1073740800
	%inner.or = or i32 %inner.and, %inner.and2
	%mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
	%mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
	%shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
	%and.outer = and i32 %shl.outer.insert, 1072693248
	%or.outer = or i32 %inner.or, %and.outer
	%result = bitcast i32 %or.outer to float
	ret float %result
	}

	define float @v_bfi_two_levels(float %x, float %y, float %z) {
	; GCN-LABEL: v_bfi_two_levels:
	; GCN: ; %bb.0: ; %.entry
	; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
	; GCN-NEXT: v_mul_f32_e32 v0, 0x447fc000, v0
	; GCN-NEXT: v_lshlrev_b32_e32 v3, 5, v1
	; GCN-NEXT: v_and_b32_e32 v2, 0xc000001f, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
	; GCN-NEXT: v_and_b32_e32 v3, 0x3e0, v3
	; GCN-NEXT: v_and_b32_e32 v1, 0xffc00, v1
	; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v0
	; GCN-NEXT: v_or_b32_e32 v2, v3, v2
	; GCN-NEXT: v_or_b32_e32 v1, v2, v1
	; GCN-NEXT: v_and_b32_e32 v0, 0x3ff00000, v0
	; GCN-NEXT: v_or_b32_e32 v0, v1, v0
	; GCN-NEXT: s_setpc_b64 s[30:31]
	.entry:
	%y.i32 = fptoui float %y to i32
	%shl.insert.inner = shl i32 %y.i32, 5
	%and.insert.inner = and i32 %shl.insert.inner, 992
	%z.i32 = fptoui float %z to i32
	%base.inner = and i32 %z.i32, -1073741793
	%or.inner = or i32 %and.insert.inner , %base.inner
	%shl.insert.mid = shl i32 %y.i32, 10
	%and.insert.mid = and i32 %shl.insert.mid, 1047552
	%or.mid = or i32 %or.inner, %and.insert.mid
	%fmul.insert.outer = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
	%cast.insert.outer = fptoui float %fmul.insert.outer to i32
	%shl.insert.outer = shl i32 %cast.insert.outer, 20
	%and.insert.outer = and i32 %shl.insert.outer, 1072693248
	%or.outer = or i32 %or.mid, %and.insert.outer
	%result = bitcast i32 %or.outer to float
	ret float %result
	}

	define float @v_bfi_two_levels_inner_or_multiple_uses(float %x, float %y, float %z) {
	; GCN-LABEL: v_bfi_two_levels_inner_or_multiple_uses:
	; GCN: ; %bb.0: ; %.entry
	; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
	; GCN-NEXT: v_mul_f32_e32 v0, 0x447fc000, v0
	; GCN-NEXT: v_lshlrev_b32_e32 v3, 5, v1
	; GCN-NEXT: v_and_b32_e32 v2, 0xc000001f, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
	; GCN-NEXT: v_and_b32_e32 v3, 0x3e0, v3
	; GCN-NEXT: v_and_b32_e32 v1, 0xffc00, v1
	; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v0
	; GCN-NEXT: v_or_b32_e32 v2, v3, v2
	; GCN-NEXT: v_or_b32_e32 v1, v2, v1
	; GCN-NEXT: v_and_b32_e32 v0, 0x3ff00000, v0
	; GCN-NEXT: v_or_b32_e32 v0, v1, v0
	; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
	; GCN-NEXT: s_setpc_b64 s[30:31]
	.entry:
	%y.i32 = fptoui float %y to i32
	%shl.insert.inner = shl i32 %y.i32, 5
	%and.insert.inner = and i32 %shl.insert.inner, 992
	%z.i32 = fptoui float %z to i32
	%base.inner = and i32 %z.i32, -1073741793
	%or.inner = or i32 %and.insert.inner , %base.inner
	%shl.insert.mid = shl i32 %y.i32, 10
	%and.insert.mid = and i32 %shl.insert.mid, 1047552
	%or.mid = or i32 %or.inner, %and.insert.mid
	%fmul.insert.outer = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
	%cast.insert.outer = fptoui float %fmul.insert.outer to i32
	%shl.insert.outer = shl i32 %cast.insert.outer, 20
	%and.insert.outer = and i32 %shl.insert.outer, 1072693248
	%or.outer = or i32 %or.mid, %and.insert.outer
	%result = bitcast i32 %or.outer to float
	%or.inner.float = bitcast i32 %or.inner to float
	%result2 = fmul float %result, %or.inner.float
	ret float %result2
	}

	define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) {
	; GCN-LABEL: v_bfi_single_constant_as_partition:
	; GCN: ; %bb.0: ; %.entry
	; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_mul_f32_e32 v2, 0x447fc000, v2
	; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
	; GCN-NEXT: v_mul_f32_e32 v0, 0x447fc000, v0
	; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
	; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
	; GCN-NEXT: v_or_b32_e32 v1, v1, v2
	; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v0
	; GCN-NEXT: v_or_b32_e32 v0, v1, v0
	; GCN-NEXT: s_setpc_b64 s[30:31]
	.entry:
	%mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
	%mul.base.i32 = fptoui float %mul.base to i32
	%y.i32 = fptoui float %y to i32
	%shl.inner.insert = shl i32 %y.i32, 10
	%bfi1.or = or i32 %shl.inner.insert, %mul.base.i32
	%mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
	%mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
	%shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
	%and.outer = and i32 %shl.outer.insert, -1
	%or.outer = or i32 %bfi1.or, %and.outer
	%result = bitcast i32 %or.outer to float
	ret float %result
	}

	define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %out, i16 %a, i32 %b) {
	; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops:
	; GCN: ; %bb.0:
	; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
	; GCN-NEXT: s_mov_b32 s7, 0xf000
	; GCN-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000
	; GCN-NEXT: s_and_b32 s2, s2, 0xffff
	; GCN-NEXT: s_or_b32 s2, s2, s3
	; GCN-NEXT: s_mov_b32 s6, -1
	; GCN-NEXT: s_mov_b32 s4, s0
	; GCN-NEXT: s_mov_b32 s5, s1
	; GCN-NEXT: v_mov_b32_e32 v0, s2
	; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
	; GCN-NEXT: s_endpgm
	%shift = lshr i32 %b, 16
	%tr = trunc i32 %shift to i16
	%tmp = insertelement <2 x i16> undef, i16 %a, i32 0
	%vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
	%val = bitcast <2 x i16> %vec to i32
	store i32 %val, ptr addrspace(1) %out, align 4
	ret void
	}