This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] gfx1010 wavefrontsize intrinsic folding
ClosedPublic

Authored by rampitec on Jun 12 2019, 7:47 AM.

Download Raw Diff

Details

Reviewers

kzhuravl
msearles

Commits

rGa9191c8492ab: [AMDGPU] gfx1010 wavefrontsize intrinsic folding
rL363588: [AMDGPU] gfx1010 wavefrontsize intrinsic folding

Diff Detail

Repository: rL LLVM

Event Timeline

rampitec created this revision. · Jun 12 2019, 7:47 AM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald Transcript · Jun 12 2019, 7:47 AM

rampitec added a parent revision: D63204: [AMDGPU] gfx1010 core wave32 changes. · Jun 12 2019, 7:48 AM

I don't see codegen for this, so this looks like depending on an optimization pass for correctness

lib/Target/AMDGPU/AMDGPULibCalls.cpp
1392–1414 ↗	(On Diff #204290)	This doesn't really belong in this pass
test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
7–8 ↗	(On Diff #204290)	Capitalized feature name is wrong? Also shouldn't be using opt -O3

In D63206#1539987, @arsenm wrote:

I don't see codegen for this, so this looks like depending on an optimization pass for correctness

It is in the parent patch, inside ISelLowering. Optimization is only to fold it earlier.

arsenm added inline comments. · Jun 12 2019, 8:26 AM

lib/Target/AMDGPU/AMDGPULibCalls.cpp
1392–1414 ↗	(On Diff #204290)	This might make more sense in AMDGPULowerIntrinsics?

rampitec marked an inline comment as done. · Jun 12 2019, 8:31 AM

rampitec added inline comments.

lib/Target/AMDGPU/AMDGPULibCalls.cpp
1392–1414 ↗	(On Diff #204290)	The point is to give folding a chance as early as possible. It also belongs to libcalls because the library is the primary user.

lgtm

This revision is now accepted and ready to land. · Jun 13 2019, 5:17 PM

Rebased.

rampitec removed a parent revision: D63204: [AMDGPU] gfx1010 core wave32 changes. · Jun 17 2019, 10:53 AM

Closed by commit rL363588: [AMDGPU] gfx1010 wavefrontsize intrinsic folding (authored by rampitec). · Explain Why · Jun 17 2019, 10:57 AM

This revision was automatically updated to reflect the committed changes.

Herald added a project: Restricted Project. · View Herald Transcript · Jun 17 2019, 10:57 AM

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

AMDGPU.h

3 lines

AMDGPULibCalls.cpp

70 lines

AMDGPUTargetMachine.cpp

2 lines

test/

CodeGen/

AMDGPU/

llvm.amdgcn.wavefrontsize.ll

84 lines

Diff 205125

llvm/trunk/lib/Target/AMDGPU/AMDGPU.h

	Show First 20 Lines • Show All 47 Lines • ▼ Show 20 Lines
	FunctionPass *createSIWholeQuadModePass();			FunctionPass *createSIWholeQuadModePass();
	FunctionPass *createSIFixControlFlowLiveIntervalsPass();			FunctionPass *createSIFixControlFlowLiveIntervalsPass();
	FunctionPass *createSIOptimizeExecMaskingPreRAPass();			FunctionPass *createSIOptimizeExecMaskingPreRAPass();
	FunctionPass *createSIFixSGPRCopiesPass();			FunctionPass *createSIFixSGPRCopiesPass();
	FunctionPass *createSIMemoryLegalizerPass();			FunctionPass *createSIMemoryLegalizerPass();
	FunctionPass *createSIInsertWaitcntsPass();			FunctionPass *createSIInsertWaitcntsPass();
	FunctionPass *createSIPreAllocateWWMRegsPass();			FunctionPass *createSIPreAllocateWWMRegsPass();
	FunctionPass *createSIFormMemoryClausesPass();			FunctionPass *createSIFormMemoryClausesPass();
	FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);			FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
				const TargetMachine *);
	FunctionPass *createAMDGPUUseNativeCallsPass();			FunctionPass *createAMDGPUUseNativeCallsPass();
	FunctionPass *createAMDGPUCodeGenPreparePass();			FunctionPass *createAMDGPUCodeGenPreparePass();
	FunctionPass *createAMDGPUMachineCFGStructurizerPass();			FunctionPass *createAMDGPUMachineCFGStructurizerPass();
	FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);			FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
	ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);			ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
	FunctionPass *createAMDGPURewriteOutArgumentsPass();			FunctionPass *createAMDGPURewriteOutArgumentsPass();
	FunctionPass *createSIModeRegisterPass();			FunctionPass *createSIModeRegisterPass();

	▲ Show 20 Lines • Show All 242 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/AMDGPULibCalls.cpp

Show All 9 Lines
/// This file does AMD library function optimizations.		/// This file does AMD library function optimizations.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "amdgpu-simplifylib"		#define DEBUG_TYPE "amdgpu-simplifylib"

#include "AMDGPU.h"		#include "AMDGPU.h"
#include "AMDGPULibFunc.h"		#include "AMDGPULibFunc.h"
		#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"		#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"		#include "llvm/Analysis/Loads.h"
#include "llvm/ADT/StringSet.h"		#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringRef.h"		#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
		#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"		#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"		#include "llvm/Support/raw_ostream.h"
		#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"		#include "llvm/Target/TargetOptions.h"
#include <vector>		#include <vector>
#include <cmath>		#include <cmath>

using namespace llvm;		using namespace llvm;

static cl::opt<bool> EnablePreLink("amdgpu-prelink",		static cl::opt<bool> EnablePreLink("amdgpu-prelink",
cl::desc("Enable pre-link mode optimizations"),		cl::desc("Enable pre-link mode optimizations"),
Show All 20 Lines

namespace llvm {		namespace llvm {

class AMDGPULibCalls {		class AMDGPULibCalls {
private:		private:

typedef llvm::AMDGPULibFunc FuncInfo;		typedef llvm::AMDGPULibFunc FuncInfo;

		const TargetMachine *TM;

// -fuse-native.		// -fuse-native.
bool AllNative = false;		bool AllNative = false;

bool useNativeFunc(const StringRef F) const;		bool useNativeFunc(const StringRef F) const;

// Return a pointer (pointer expr) to the function if function defintion with		// Return a pointer (pointer expr) to the function if function defintion with
// "FuncName" exists. It may create a new function prototype in pre-link mode.		// "FuncName" exists. It may create a new function prototype in pre-link mode.
FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);		FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	private:
bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);		bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);

// sin/cos		// sin/cos
bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA);		bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA);

// __read_pipe/__write_pipe		// __read_pipe/__write_pipe
bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);		bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);

		// llvm.amdgcn.wavefrontsize
		bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);

// Get insertion point at entry.		// Get insertion point at entry.
BasicBlock::iterator getEntryIns(CallInst * UI);		BasicBlock::iterator getEntryIns(CallInst * UI);
// Insert an Alloc instruction.		// Insert an Alloc instruction.
AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);		AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
// Get a scalar native builtin signle argument FP function		// Get a scalar native builtin signle argument FP function
FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);		FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);

protected:		protected:
CallInst *CI;		CallInst *CI;

bool isUnsafeMath(const CallInst *CI) const;		bool isUnsafeMath(const CallInst *CI) const;

void replaceCall(Value *With) {		void replaceCall(Value *With) {
CI->replaceAllUsesWith(With);		CI->replaceAllUsesWith(With);
CI->eraseFromParent();		CI->eraseFromParent();
}		}

public:		public:
		AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}

bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);		bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);

void initNativeFuncs();		void initNativeFuncs();

// Replace a normal math function call with that native version		// Replace a normal math function call with that native version
bool useNative(CallInst *CI);		bool useNative(CallInst *CI);
};		};

} // end llvm namespace		} // end llvm namespace

namespace {		namespace {

class AMDGPUSimplifyLibCalls : public FunctionPass {		class AMDGPUSimplifyLibCalls : public FunctionPass {

AMDGPULibCalls Simplifier;

const TargetOptions Options;		const TargetOptions Options;

		AMDGPULibCalls Simplifier;

public:		public:
static char ID; // Pass identification		static char ID; // Pass identification

AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())		AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
: FunctionPass(ID), Options(Opt) {		const TargetMachine *TM = nullptr)
		: FunctionPass(ID), Options(Opt), Simplifier(TM) {
initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());		initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}		}

void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();		AU.addRequired<AAResultsWrapperPass>();
}		}

bool runOnFunction(Function &M) override;		bool runOnFunction(Function &M) override;
▲ Show 20 Lines • Show All 448 Lines • ▼ Show 20 Lines
// This function returns false if no change; return true otherwise.		// This function returns false if no change; return true otherwise.
bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {		bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
this->CI = CI;		this->CI = CI;
Function *Callee = CI->getCalledFunction();		Function *Callee = CI->getCalledFunction();

// Ignore indirect calls.		// Ignore indirect calls.
if (Callee == 0) return false;		if (Callee == 0) return false;

FuncInfo FInfo;
if (!parseFunctionName(Callee->getName(), &FInfo))
return false;

// Further check the number of arguments to see if they match.
if (CI->getNumArgOperands() != FInfo.getNumArgs())
return false;

BasicBlock *BB = CI->getParent();		BasicBlock *BB = CI->getParent();
LLVMContext &Context = CI->getParent()->getContext();		LLVMContext &Context = CI->getParent()->getContext();
IRBuilder<> B(Context);		IRBuilder<> B(Context);

// Set the builder to the instruction after the call.		// Set the builder to the instruction after the call.
B.SetInsertPoint(BB, CI->getIterator());		B.SetInsertPoint(BB, CI->getIterator());

// Copy fast flags from the original call.		// Copy fast flags from the original call.
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))		if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
B.setFastMathFlags(FPOp->getFastMathFlags());		B.setFastMathFlags(FPOp->getFastMathFlags());

		switch (Callee->getIntrinsicID()) {
		default:
		break;
		case Intrinsic::amdgcn_wavefrontsize:
		return !EnablePreLink && fold_wavefrontsize(CI, B);
		}

		FuncInfo FInfo;
		if (!parseFunctionName(Callee->getName(), &FInfo))
		return false;

		// Further check the number of arguments to see if they match.
		if (CI->getNumArgOperands() != FInfo.getNumArgs())
		return false;

if (TDOFold(CI, FInfo))		if (TDOFold(CI, FInfo))
return true;		return true;

// Under unsafe-math, evaluate calls if possible.		// Under unsafe-math, evaluate calls if possible.
// According to Brian Sumner, we can do this for all f32 function calls		// According to Brian Sumner, we can do this for all f32 function calls
// using host's double function calls.		// using host's double function calls.
if (isUnsafeMath(CI) && evaluateCall(CI, FInfo))		if (isUnsafeMath(CI) && evaluateCall(CI, FInfo))
return true;		return true;
▲ Show 20 Lines • Show All 697 Lines • ▼ Show 20 Lines	if (!isSin) { // CI->cos, UI->sin
UI->replaceAllUsesWith(Reload);		UI->replaceAllUsesWith(Reload);
CI->replaceAllUsesWith(Call);		CI->replaceAllUsesWith(Call);
UI->eraseFromParent();		UI->eraseFromParent();
CI->eraseFromParent();		CI->eraseFromParent();
}		}
return true;		return true;
}		}

		bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
		if (!TM)
		return false;

		StringRef CPU = TM->getTargetCPU();
		StringRef Features = TM->getTargetFeatureString();
		if ((CPU.empty() || CPU.equals_lower("generic")) &&
		(Features.empty() ||
		Features.find_lower("wavefrontsize") == StringRef::npos))
		return false;

		Function *F = CI->getParent()->getParent();
		const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F);
		unsigned N = ST.getWavefrontSize();

		LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
		<< N << "\n");

		CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N));
		CI->eraseFromParent();
		return true;
		}

// Get insertion point at entry.		// Get insertion point at entry.
BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {		BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
Function * Func = UI->getParent()->getParent();		Function * Func = UI->getParent()->getParent();
BasicBlock * BB = &Func->getEntryBlock();		BasicBlock * BB = &Func->getEntryBlock();
assert(BB && "Entry block not found!");		assert(BB && "Entry block not found!");
BasicBlock::iterator ItNew = BB->begin();		BasicBlock::iterator ItNew = BB->begin();
return ItNew;		return ItNew;
}		}
▲ Show 20 Lines • Show All 293 Lines • ▼ Show 20 Lines	if (hasTwoResults) {
new StoreInst(nval1, aCI->getArgOperand(1), aCI);		new StoreInst(nval1, aCI->getArgOperand(1), aCI);
}		}

replaceCall(nval0);		replaceCall(nval0);
return true;		return true;
}		}

// Public interface to the Simplify LibCalls pass.		// Public interface to the Simplify LibCalls pass.
FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {		FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
return new AMDGPUSimplifyLibCalls(Opt);		const TargetMachine *TM) {
		return new AMDGPUSimplifyLibCalls(Opt, TM);
}		}

FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {		FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
return new AMDGPUUseNativeCalls();		return new AMDGPUUseNativeCalls();
}		}

static bool setFastFlags(Function &F, const TargetOptions &Options) {		static bool setFastFlags(Function &F, const TargetOptions &Options) {
AttrBuilder B;		AttrBuilder B;
▲ Show 20 Lines • Show All 73 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Show First 20 Lines • Show All 426 Lines • ▼ Show 20 Lines	[AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
legacy::PassManagerBase &PM) {		legacy::PassManagerBase &PM) {
if (AMDGPUAA) {		if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());		PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());		PM.add(createAMDGPUExternalAAWrapperPass());
}		}
PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));		PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
PM.add(llvm::createAMDGPUUseNativeCallsPass());		PM.add(llvm::createAMDGPUUseNativeCallsPass());
if (LibCallSimplify)		if (LibCallSimplify)
PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));		PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
});		});

Builder.addExtension(		Builder.addExtension(
PassManagerBuilder::EP_CGSCCOptimizerLate,		PassManagerBuilder::EP_CGSCCOptimizerLate,
[](const PassManagerBuilder &, legacy::PassManagerBase &PM) {		[](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
// Add infer address spaces pass to the opt pipeline after inlining		// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.		// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());		PM.add(createInferAddressSpacesPass());
▲ Show 20 Lines • Show All 617 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll

				; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
				; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
				; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s

				; RUN: opt -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
				; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
				; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
				; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
				; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
				; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
				; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s

				; GCN-LABEL: {{^}}fold_wavefrontsize:
				; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(

				; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
				; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
				; GCN: store_dword v[{{[0-9:]+}}], [[V]]

				; OPT-W32: store i32 32, i32 addrspace(1)* %arg, align 4
				; OPT-W64: store i32 64, i32 addrspace(1)* %arg, align 4
				; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
				; OPT-WXX: store i32 %tmp, i32 addrspace(1)* %arg, align 4
				; OPT-NEXT: ret void

				define amdgpu_kernel void @fold_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
				bb:
				%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
				store i32 %tmp, i32 addrspace(1)* %arg, align 4
				ret void
				}

				; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
				; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(

				; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
				; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
				; GCN-NOT: cndmask
				; GCN: store_dword v[{{[0-9:]+}}], [[V]]

				; OPT-W32: store i32 1, i32 addrspace(1)* %arg, align 4
				; OPT-W64: store i32 2, i32 addrspace(1)* %arg, align 4
				; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
				; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32
				; OPT-WXX: %tmp2 = select i1 %tmp1, i32 2, i32 1
				; OPT-WXX: store i32 %tmp2, i32 addrspace(1)* %arg
				; OPT-NEXT: ret void

				define amdgpu_kernel void @fold_and_optimize_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
				bb:
				%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
				%tmp1 = icmp ugt i32 %tmp, 32
				%tmp2 = select i1 %tmp1, i32 2, i32 1
				store i32 %tmp2, i32 addrspace(1)* %arg
				ret void
				}

				; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
				; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(

				; OPT: bb:
				; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
				; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32
				; OPT-WXX: bb3:
				; OPT-W64: store i32 1, i32 addrspace(1)* %arg, align 4
				; OPT-NEXT: ret void

				define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
				bb:
				%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
				%tmp1 = icmp ugt i32 %tmp, 32
				br i1 %tmp1, label %bb2, label %bb3

				bb2: ; preds = %bb
				store i32 1, i32 addrspace(1)* %arg, align 4
				br label %bb3

				bb3: ; preds = %bb2, %bb
				ret void
				}

				declare i32 @llvm.amdgcn.wavefrontsize() #0

				attributes #0 = { nounwind readnone speculatable }