Diff 521223

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Show All 12 Lines
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AMDGPU.h"		#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"		#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"		#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"		#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"		#include "llvm/IR/InstVisitor.h"
		#include "llvm/IR/Instructions.h"
		#include "llvm/IR/IntrinsicInst.h"
		#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"		#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"		#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"		#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"		#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;		using namespace llvm;
Show All 13 Lines
class AMDGPULateCodeGenPrepare		class AMDGPULateCodeGenPrepare
: public FunctionPass,		: public FunctionPass,
public InstVisitor<AMDGPULateCodeGenPrepare, bool> {		public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
Module *Mod = nullptr;		Module *Mod = nullptr;
const DataLayout *DL = nullptr;		const DataLayout *DL = nullptr;

AssumptionCache *AC = nullptr;		AssumptionCache *AC = nullptr;
UniformityInfo *UA = nullptr;		UniformityInfo *UA = nullptr;
		DominatorTree *DT = nullptr;

public:		public:
static char ID;		static char ID;

AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}		AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

StringRef getPassName() const override {		StringRef getPassName() const override {
return "AMDGPU IR late optimizations";		return "AMDGPU IR late optimizations";
Show All 13 Lines	public:
// Check if the specified value is at least DWORD aligned.		// Check if the specified value is at least DWORD aligned.
bool isDWORDAligned(const Value *V) const {		bool isDWORDAligned(const Value *V) const {
KnownBits Known = computeKnownBits(V, *DL, 0, AC);		KnownBits Known = computeKnownBits(V, *DL, 0, AC);
return Known.countMinTrailingZeros() >= 2;		return Known.countMinTrailingZeros() >= 2;
}		}

bool canWidenScalarExtLoad(LoadInst &LI) const;		bool canWidenScalarExtLoad(LoadInst &LI) const;
bool visitLoadInst(LoadInst &LI);		bool visitLoadInst(LoadInst &LI);
		bool visitIntrinsicInst(IntrinsicInst &I);
		bool cloneInstructionToUsers(Instruction *I);
		bool hasUndefOrPoisonOperand(IntrinsicInst &I, unsigned Operand = 0) const;
};		};

} // end anonymous namespace		} // end anonymous namespace

// Caches the module and its DataLayout for the visitors; always returns false.		// Caches the module and its DataLayout for the visitors; always returns false.
bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {		bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
Mod = &M;		Mod = &M;
DL = &Mod->getDataLayout();		DL = &Mod->getDataLayout();
return false;		return false;
}		}

// Per-function entry point: fetches the analyses, then visits every instruction.		// Per-function entry point: fetches the analyses, then visits every instruction.
// Returns true if any visitor reported a change.		// Returns true if any visitor reported a change.
bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {		bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
if (skipFunction(F))		if (skipFunction(F))
return false;		return false;

AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);		AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();		UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

		// The dominator tree is optional: DT stays null when the wrapper pass is
		// not available, and cloneInstructionToUsers then rewrites nothing.
		auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
		DT = DTWP ? &DTWP->getDomTree() : nullptr;

bool Changed = false;		bool Changed = false;
// Early-increment iteration, so a visitor may delete the instruction it is handed.		// Early-increment iteration, so a visitor may delete the instruction it is handed.
for (auto &BB : F)		for (auto &BB : F)
for (Instruction &I : llvm::make_early_inc_range(BB))		for (Instruction &I : llvm::make_early_inc_range(BB))
Changed \|= visit(I);		Changed \|= visit(I);

return Changed;		return Changed;
}		}

▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
auto *NewVal = IRB.CreateBitCast(		auto *NewVal = IRB.CreateBitCast(
IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());		IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
LI.replaceAllUsesWith(NewVal);		LI.replaceAllUsesWith(NewVal);
RecursivelyDeleteTriviallyDeadInstructions(&LI);		RecursivelyDeleteTriviallyDeadInstructions(&LI);

return true;		return true;
}		}

		// Visitor hook for intrinsic calls. Only llvm.fabs is handled: it is handed
		// to cloneInstructionToUsers. Every other intrinsic is left untouched.
		bool AMDGPULateCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
		  if (I.getIntrinsicID() != Intrinsic::fabs)
		    return false;

		  return cloneInstructionToUsers(&I);
		}

		/// Give every other basic block that uses \p I its own copy of \p I.
		///
		/// This lets SelectionDAG use source modifiers more often. In some cases,
		/// such as after inlining a function that introduces a call to llvm.fabs,
		/// the users of the call end up outside its basic block, and instruction
		/// selection then cannot apply the abs modifier to them.
		///
		/// Users in other blocks are collected, one clone of \p I is inserted at
		/// the first insertion point of each such block, and the collected users
		/// are redirected to the clone in their own block. \p I itself is erased
		/// once no use of it remains.
		///
		/// For now the only caller passes llvm.fabs calls; fneg appears to work
		/// already. NOTE: this could be adjusted to move the call instead of
		/// cloning it when there is only a single user outside of its block.
		///
		/// \param I The instruction to clone.
		/// \returns true if at least one user was rewritten.
		bool AMDGPULateCodeGenPrepare::cloneInstructionToUsers(Instruction *I) {
		  auto *II = dyn_cast<IntrinsicInst>(I);

		  // Cloning a call whose operand is undef or poison gains nothing.
		  if (II && hasUndefOrPoisonOperand(*II))
		    return false;

		  // Collect first and rewrite afterwards: replacing uses while walking
		  // I->users() would modify the use list under the iteration.
		  SmallVector<Instruction *, 4> Candidates;
		  for (User *U : I->users()) {
		    // Every user of an Instruction is an Instruction, so cast<> cannot
		    // fail here and no null check is needed.
		    auto *UI = cast<Instruction>(U);

		    // Users in the defining block keep using the original.
		    if (UI->getParent() == I->getParent())
		      continue;

		    // PHI users are left alone: the clone is placed after the PHIs of the
		    // user's block and therefore could not feed them. Without a dominator
		    // tree nothing is rewritten; for non-PHI users the dominance query is
		    // expected to hold in SSA form and is kept as a cheap guard.
		    if (isa<PHINode>(UI) || !DT || !DT->dominates(I, UI))
		      continue;

		    // Don't generate fabs(fabs(x)) calls.
		    if (II) {
		      if (auto *UII = dyn_cast<IntrinsicInst>(UI);
		          UII && UII->getIntrinsicID() == II->getIntrinsicID())
		        continue;
		    }

		    Candidates.push_back(UI);
		  }

		  // The map is only used for lookup. Clones are created in use-list order,
		  // so the result does not depend on pointer values.
		  DenseMap<BasicBlock *, Instruction *> ClonePerBB;
		  bool Changed = false;

		  for (Instruction *UI : Candidates) {
		    BasicBlock *BB = UI->getParent();
		    Instruction *&Clone = ClonePerBB[BB];

		    if (!Clone) {
		      BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
		      // A block without an insertion point cannot take a clone; its users
		      // keep the original.
		      if (InsertPt == BB->end())
		        continue;

		      Clone = I->clone();
		      Clone->setName(I->getName() + Twine(".clone"));
		      Clone->insertBefore(&*InsertPt);
		    }

		    // Adjust the user so it uses the BB-local clone.
		    UI->replaceUsesOfWith(I, Clone);
		    Changed = true;
		  }

		  // Only drop the original when every use was redirected. Any skipped
		  // user (same block, PHI, nested fabs, ...) still references it, and
		  // erasing a value that has uses is invalid.
		  if (Changed && I->use_empty())
		    I->eraseFromParent();

		  return Changed;
		}

		// Returns true when operand number Operand of I matches the undef or the
		// poison pattern.
		bool AMDGPULateCodeGenPrepare::hasUndefOrPoisonOperand(IntrinsicInst &I,
		                                                       unsigned Operand) const {
		  Value *Op = I.getOperand(Operand);

		  if (PatternMatch::match(Op, PatternMatch::m_Undef()))
		    return true;

		  return PatternMatch::match(Op, PatternMatch::m_Poison());
		}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,		INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)		"AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)		INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)		INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,		INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)		"AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;		char AMDGPULateCodeGenPrepare::ID = 0;

// Factory: returns a newly allocated AMDGPULateCodeGenPrepare pass instance.		// Factory: returns a newly allocated AMDGPULateCodeGenPrepare pass instance.
FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {		FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
return new AMDGPULateCodeGenPrepare();		return new AMDGPULateCodeGenPrepare();
}		}

llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
	; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s \| FileCheck %s -check-prefix=ISA			; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s \| FileCheck %s -check-prefix=ISA
	; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-before=si-fix-sgpr-copies < %s \| FileCheck %s -check-prefix=MIR			; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-before=si-fix-sgpr-copies < %s \| FileCheck %s -check-prefix=MIR

	define void @f(i32 %arg, ptr %ptr) {			define void @f(i32 %arg, ptr %ptr) {
	; ISA-LABEL: f:			; ISA-LABEL: f:
	; ISA: ; %bb.0: ; %bb			; ISA: ; %bb.0: ; %bb
	; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; ISA-NEXT: s_waitcnt_vscnt null, 0x0			; ISA-NEXT: s_waitcnt_vscnt null, 0x0
	; ISA-NEXT: s_mov_b64 s[4:5], 0			; ISA-NEXT: s_mov_b64 s[4:5], 0
	; ISA-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v0			; ISA-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v0
	; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0			; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
	; ISA-NEXT: v_mov_b32_e32 v7, 0			; ISA-NEXT: v_mov_b32_e32 v7, 0
	; ISA-NEXT: s_waitcnt lgkmcnt(0)			; ISA-NEXT: s_waitcnt lgkmcnt(0)
	; ISA-NEXT: s_cmp_lg_u32 s4, 0			; ISA-NEXT: s_lshr_b32 s6, s5, 1
	; ISA-NEXT: s_cselect_b32 s6, -1, 0
	; ISA-NEXT: s_and_b32 s6, s6, exec_lo
	; ISA-NEXT: s_cselect_b32 s6, s5, 0
	; ISA-NEXT: s_lshr_b32 s7, 1, s4			; ISA-NEXT: s_lshr_b32 s7, 1, s4
	; ISA-NEXT: s_cmp_lg_u32 s4, 0			; ISA-NEXT: s_cmp_lg_u32 s4, 0
	; ISA-NEXT: v_cvt_f32_i32_e32 v0, s6
	; ISA-NEXT: s_cselect_b32 s8, -1, 0
	; ISA-NEXT: s_and_b32 s8, s8, exec_lo
	; ISA-NEXT: s_cselect_b32 s7, s7, 0
	; ISA-NEXT: s_lshr_b32 s5, s5, 1
	; ISA-NEXT: s_cmp_lg_u32 s4, 0
	; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s7
	; ISA-NEXT: s_cselect_b32 s4, -1, 0			; ISA-NEXT: s_cselect_b32 s4, -1, 0
	; ISA-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4			; ISA-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
	; ISA-NEXT: s_and_b32 s4, s4, exec_lo			; ISA-NEXT: s_and_b32 s4, s4, exec_lo
	; ISA-NEXT: s_cselect_b32 s4, s5, 0			; ISA-NEXT: s_cselect_b32 s4, s6, 0
	; ISA-NEXT: v_cvt_f32_i32_e32 v5, s4			; ISA-NEXT: s_cselect_b32 s6, s7, 0
				; ISA-NEXT: s_cselect_b32 s5, s5, 0
				; ISA-NEXT: v_cvt_f32_i32_e32 v3, s4
				; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s6
				; ISA-NEXT: v_cvt_f32_i32_e32 v5, s5
	; ISA-NEXT: s_mov_b32 s4, 0			; ISA-NEXT: s_mov_b32 s4, 0
	; ISA-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
	; ISA-NEXT: .LBB0_1: ; %bb14			; ISA-NEXT: .LBB0_1: ; %bb14
	; ISA-NEXT: ; =>This Inner Loop Header: Depth=1			; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
	; ISA-NEXT: v_mov_b32_e32 v6, v7			; ISA-NEXT: v_mov_b32_e32 v6, v7
	; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo			; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo
	; ISA-NEXT: s_or_b32 s4, s5, s4			; ISA-NEXT: s_or_b32 s4, s5, s4
	; ISA-NEXT: v_add_f32_e32 v7, v6, v3			; ISA-NEXT: v_add_f32_e32 v7, v6, v0
	; ISA-NEXT: v_add_f32_e32 v7, v7, v5			; ISA-NEXT: v_add_f32_e64 v7, v7, \|v3\|
	; ISA-NEXT: v_add_f32_e32 v7, v7, v4			; ISA-NEXT: v_add_f32_e32 v7, v7, v4
	; ISA-NEXT: v_add_f32_e32 v7, v7, v0			; ISA-NEXT: v_add_f32_e32 v7, v7, v5
	; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4			; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
	; ISA-NEXT: s_cbranch_execnz .LBB0_1			; ISA-NEXT: s_cbranch_execnz .LBB0_1
	; ISA-NEXT: ; %bb.2: ; %bb21			; ISA-NEXT: ; %bb.2: ; %bb21
	; ISA-NEXT: s_or_b32 exec_lo, exec_lo, s4			; ISA-NEXT: s_or_b32 exec_lo, exec_lo, s4
	; ISA-NEXT: flat_store_dword v[1:2], v6			; ISA-NEXT: flat_store_dword v[1:2], v6
	; ISA-NEXT: s_waitcnt lgkmcnt(0)			; ISA-NEXT: s_waitcnt lgkmcnt(0)
	; ISA-NEXT: s_waitcnt_vscnt null, 0x0			; ISA-NEXT: s_waitcnt_vscnt null, 0x0
	; ISA-NEXT: s_setpc_b64 s[30:31]			; ISA-NEXT: s_setpc_b64 s[30:31]
	; MIR-LABEL: name: f			; MIR-LABEL: name: f
	; MIR: bb.0.bb:			; MIR: bb.0.bb:
	; MIR-NEXT: successors: %bb.1(0x80000000)			; MIR-NEXT: successors: %bb.1(0x80000000)
	; MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2			; MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
	; MIR-NEXT: {{ $}}			; MIR-NEXT: {{ $}}
	; MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2			; MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
	; MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1			; MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
	; MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0			; MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
				; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
				; MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
	; MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0			; MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
	; MIR-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4)			; MIR-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4)
	; MIR-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1			; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
	; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0			; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
	; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0			; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
	; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc			; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc
	; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc			; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_]], [[COPY5]], implicit-def dead $scc
	; MIR-NEXT: $scc = COPY [[COPY5]]			; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
	; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY3]], [[S_MOV_B32_]], implicit $scc			; MIR-NEXT: S_CMP_LG_U32 [[COPY5]], [[S_MOV_B32_1]], implicit-def $scc
	; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
	; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_1]], [[COPY4]], implicit-def dead $scc
	; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
	; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc			; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
	; MIR-NEXT: $scc = COPY [[COPY6]]			; MIR-NEXT: $scc = COPY [[COPY6]]
	; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_]], implicit $scc			; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_1]], implicit $scc
	; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY3]], [[S_MOV_B32_1]], implicit-def dead $scc			; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
	; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc			; MIR-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_]]
	; MIR-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc			; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
	; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1			; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
	; MIR-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]			; MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
	; MIR-NEXT: $scc = COPY [[COPY7]]			; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_3]], 0, [[COPY8]], [[COPY6]], implicit $exec
	; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_]], implicit $scc			; MIR-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
	; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec			; MIR-NEXT: $scc = COPY [[COPY6]]
	; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647			; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_1]], implicit $scc
	; MIR-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[V_CVT_F32_I32_e64_]]
	; MIR-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 killed [[COPY9]], killed [[S_MOV_B32_2]], implicit-def dead $scc
	; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
	; MIR-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
	; MIR-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_3]]
	; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_4]], 0, [[COPY10]], [[COPY7]], implicit $exec
	; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
	; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec			; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec
	; MIR-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]			; MIR-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
	; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec			; MIR-NEXT: $scc = COPY [[COPY6]]
	; MIR-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]			; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY4]], [[S_MOV_B32_1]], implicit $scc
	; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_1]], implicit $exec			; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
	; MIR-NEXT: [[COPY14:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]			; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
				; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec
				; MIR-NEXT: [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
	; MIR-NEXT: {{ $}}			; MIR-NEXT: {{ $}}
	; MIR-NEXT: bb.1.bb14:			; MIR-NEXT: bb.1.bb14:
	; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)			; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
	; MIR-NEXT: {{ $}}			; MIR-NEXT: {{ $}}
	; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %7, %bb.1			; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %7, %bb.1
	; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_4]], %bb.0, %8, %bb.1			; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_3]], %bb.0, %8, %bb.1
	; MIR-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY14]]			; MIR-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]]
	; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY15]], [[PHI]], implicit-def dead $scc			; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc
	; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec			; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec
	; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[S_AND_B32_]], 0, 0, implicit $mode, implicit $exec			; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 2, [[COPY7]], 0, 0, implicit $mode, implicit $exec
	; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY12]], 0, 0, implicit $mode, implicit $exec			; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec
	; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY13]], 0, 0, implicit $mode, implicit $exec			; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
	; MIR-NEXT: [[COPY16:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]			; MIR-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
	; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec			; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
	; MIR-NEXT: S_BRANCH %bb.2			; MIR-NEXT: S_BRANCH %bb.2
	; MIR-NEXT: {{ $}}			; MIR-NEXT: {{ $}}
	; MIR-NEXT: bb.2.bb21:			; MIR-NEXT: bb.2.bb21:
	; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1			; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1
	; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1			; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1
	; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec			; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
	; MIR-NEXT: FLAT_STORE_DWORD [[COPY8]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)			; MIR-NEXT: FLAT_STORE_DWORD [[COPY3]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
	; MIR-NEXT: SI_RETURN			; MIR-NEXT: SI_RETURN
	bb:			bb:
	%i = load <2 x i32>, ptr addrspace(4) null, align 4294967296			%i = load <2 x i32>, ptr addrspace(4) null, align 4294967296
	%i1 = extractelement <2 x i32> %i, i64 1			%i1 = extractelement <2 x i32> %i, i64 1
	%i2 = extractelement <2 x i32> %i, i64 0			%i2 = extractelement <2 x i32> %i, i64 0
	%i3 = lshr i32 %i1, 1			%i3 = lshr i32 %i1, 1
	%i4 = icmp ne i32 %i2, 0			%i4 = icmp ne i32 %i2, 0
	%i5 = select i1 %i4, i32 %i3, i32 0			%i5 = select i1 %i4, i32 %i3, i32 0
	Show All 25 Lines

llvm/test/CodeGen/AMDGPU/fold-fabs.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
	; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -disable-machine-sink=1 -start-before=amdgpu-late-codegenprepare < %s \| FileCheck -check-prefix=GFX10 %s			; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -disable-machine-sink=1 -start-before=amdgpu-late-codegenprepare < %s \| FileCheck -check-prefix=GFX10 %s

	define float @fold_abs_in_branch(float %arg1, float %arg2) {			define float @fold_abs_in_branch(float %arg1, float %arg2) {
	; GFX10-LABEL: fold_abs_in_branch:			; GFX10-LABEL: fold_abs_in_branch:
	; GFX10: ; %bb.0: ; %entry			; GFX10: ; %bb.0: ; %entry
	; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: v_add_f32_e32 v0, v0, v1			; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
	; GFX10-NEXT: s_mov_b32 s4, exec_lo			; GFX10-NEXT: s_mov_b32 s4, exec_lo
	; GFX10-NEXT: v_add_f32_e32 v1, v0, v1			; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
	; GFX10-NEXT: v_add_f32_e64 v0, \|v1\|, \|v1\|			; GFX10-NEXT: v_add_f32_e64 v0, \|v1\|, \|v1\|
	; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
	; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0			; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
	; GFX10-NEXT: ; %bb.1: ; %if			; GFX10-NEXT: ; %bb.1: ; %if
	; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1			; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, \|v1\|
	; GFX10-NEXT: ; %bb.2: ; %exit			; GFX10-NEXT: ; %bb.2: ; %exit
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4			; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
	; GFX10-NEXT: s_setpc_b64 s[30:31]			; GFX10-NEXT: s_setpc_b64 s[30:31]
	entry:			entry:
	%0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2			%0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
	%1 = fadd reassoc nnan nsz arcp contract afn float %0, %arg2			%1 = fadd reassoc nnan nsz arcp contract afn float %0, %arg2
	%2 = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %1)			%2 = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %1)
	%3 = fmul reassoc nnan nsz arcp contract afn float %2, 2.000000e+00			%3 = fmul reassoc nnan nsz arcp contract afn float %2, 2.000000e+00
	Show All 11 Lines

	define float @fold_abs_in_branch_multiple_users(float %arg1, float %arg2) {			define float @fold_abs_in_branch_multiple_users(float %arg1, float %arg2) {
	; GFX10-LABEL: fold_abs_in_branch_multiple_users:			; GFX10-LABEL: fold_abs_in_branch_multiple_users:
	; GFX10: ; %bb.0: ; %entry			; GFX10: ; %bb.0: ; %entry
	; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: v_add_f32_e32 v0, v0, v1			; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
	; GFX10-NEXT: s_mov_b32 s4, exec_lo			; GFX10-NEXT: s_mov_b32 s4, exec_lo
	; GFX10-NEXT: v_add_f32_e32 v1, v0, v1			; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
	; GFX10-NEXT: v_add_f32_e64 v0, \|v1\|, \|v1\|			; GFX10-NEXT: v_add_f32_e64 v1, \|v0\|, \|v0\|
	; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1			; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v1
	; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
	; GFX10-NEXT: ; %bb.1: ; %if			; GFX10-NEXT: ; %bb.1: ; %if
	; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1			; GFX10-NEXT: v_mul_f32_e64 v1, 0x3e4ccccd, \|v0\|
	; GFX10-NEXT: ; %bb.2: ; %exit			; GFX10-NEXT: ; %bb.2: ; %exit
	; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4			; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
	; GFX10-NEXT: v_add_f32_e32 v1, 2.0, v1			; GFX10-NEXT: v_add_f32_e64 v0, \|v0\|, 2.0
	; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0			; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
	; GFX10-NEXT: s_setpc_b64 s[30:31]			; GFX10-NEXT: s_setpc_b64 s[30:31]
	entry:			entry:
	%0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2			%0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
	%1 = fadd reassoc nnan nsz arcp contract afn float %0, %arg2			%1 = fadd reassoc nnan nsz arcp contract afn float %0, %arg2
	%2 = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %1)			%2 = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %1)
	%3 = fmul reassoc nnan nsz arcp contract afn float %2, 2.000000e+00			%3 = fmul reassoc nnan nsz arcp contract afn float %2, 2.000000e+00
	%4 = fcmp ule float %3, 1.000000e+00			%4 = fcmp ule float %3, 1.000000e+00
	br i1 %4, label %if, label %exit			br i1 %4, label %if, label %exit
	▲ Show 20 Lines • Show All 152 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Improve abs modifier usage
ClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 521223

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll

llvm/test/CodeGen/AMDGPU/fold-fabs.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Improve abs modifier usage
ClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 521223

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll

llvm/test/CodeGen/AMDGPU/fold-fabs.ll

[AMDGPU] Improve abs modifier usage
ClosedPublic