This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Partial ILP scheduler port from SelectionDAG to SchedulingDAG (experimental)
ClosedPublic

Authored by vpykhtin on Nov 10 2017, 5:09 AM.

Download Raw Diff

Details

Reviewers

rampitec
arsenm

Commits

rGf2fe9725eac3: AMDGPU: Partial ILP scheduler port from SelectionDAG to SchedulingDAG…
rL318649: AMDGPU: Partial ILP scheduler port from SelectionDAG to SchedulingDAG…

Summary

This is a partial port in that the register pressure lowering checks aren't done yet, because it isn't really straightforward to port them from the old place.

Some comments were copy-pasted from the old place; they may be outdated or irrelevant.

Diff Detail

Repository: rL LLVM

Event Timeline

vpykhtin created this revision.Nov 10 2017, 5:09 AM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald TranscriptNov 10 2017, 5:09 AM

Can you add some tests just to show it does not crash? Maybe add run-lines to schedule-regpressure-limit.ll, schedule-regpressure-limit2.ll

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
224 ↗	(On Diff #122419)	Can we use ld/st clustering mutations and MacroFusion to it?

rampitec added inline comments.Nov 10 2017, 12:30 PM

lib/Target/AMDGPU/GCNILPSched.cpp
243 ↗	(On Diff #122419)	Can queue be empty?
283 ↗	(On Diff #122419)	spaces around ==.

fixed per review issues.

vpykhtin marked 3 inline comments as done.Nov 13 2017, 7:19 AM

rampitec added inline comments.Nov 13 2017, 9:17 AM

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
224 ↗	(On Diff #122419)	I think we need MacroFusion as well.

Added test. It turns out getOccupancyWithLocalMemSize function doesn't account for amdgpu-waves-per-eu attribute so I added call to getWavesPerEU (which does).

LGTM

lib/Target/AMDGPU/GCNIterativeScheduler.cpp
569 ↗	(On Diff #123341)	Looks like this call is missing in the GCNSchedStrategy as well, so it needs to be added there too.

This revision is now accepted and ready to land.Nov 17 2017, 8:50 AM

vpykhtin added inline comments.Nov 17 2017, 8:54 AM

lib/Target/AMDGPU/GCNIterativeScheduler.cpp
569 ↗	(On Diff #123341)	Right, but I wonder if this should be added into getOccupancyWithLocalMemSize itself? It uses getMaxWavesPerEU which probably should be changed to getWavesPerEU.

rampitec added inline comments.Nov 17 2017, 9:42 AM

lib/Target/AMDGPU/GCNIterativeScheduler.cpp
569 ↗	(On Diff #123341)	It probably should, though I believe it should go as a separate change.

arsenm added inline comments.Nov 17 2017, 4:46 PM

test/CodeGen/AMDGPU/schedule-ilp.ll
3 ↗	(On Diff #123341)	This isn't a useful check. Is there something more specific to check, like there are no spills or some other improvement over the default?

Closed by commit rL318649: AMDGPU: Partial ILP scheduler port from SelectionDAG to SchedulingDAG… (authored by vpykhtin). · Explain WhyNov 20 2017, 6:38 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

AMDGPUTargetMachine.cpp

15 lines

CMakeLists.txt

1 line

GCNILPSched.cpp

364 lines

GCNIterativeScheduler.h

4 lines

GCNIterativeScheduler.cpp

51 lines

test/

CodeGen/

AMDGPU/

schedule-ilp.ll

589 lines

Diff 123583

llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Show First 20 Lines • Show All 213 Lines • ▼ Show 20 Lines	createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
return DAG;		return DAG;
}		}

static ScheduleDAGInstrs createMinRegScheduler(MachineSchedContext C) {		static ScheduleDAGInstrs createMinRegScheduler(MachineSchedContext C) {
return new GCNIterativeScheduler(C,		return new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_MINREGFORCED);		GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}		}

		/// Build a GCNIterativeScheduler configured for the SCHEDULE_ILP strategy and
		/// attach the load-cluster, store-cluster and AMDGPU macro-fusion mutations to
		/// it, in that order. The caller receives the raw scheduler pointer.
		static ScheduleDAGInstrs *
		createIterativeILPMachineScheduler(MachineSchedContext *C) {
		  auto *Sched =
		      new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);

		  Sched->addMutation(createLoadClusterDAGMutation(Sched->TII, Sched->TRI));
		  Sched->addMutation(createStoreClusterDAGMutation(Sched->TII, Sched->TRI));
		  Sched->addMutation(createAMDGPUMacroFusionDAGMutation());

		  return Sched;
		}

static MachineSchedRegistry		static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",		R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);		createR600MachineScheduler);

static MachineSchedRegistry		static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",		SISchedRegistry("si", "Run SI's custom scheduler",
createSIMachineScheduler);		createSIMachineScheduler);

static MachineSchedRegistry		static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",		GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
"Run GCN scheduler to maximize occupancy",		"Run GCN scheduler to maximize occupancy",
createGCNMaxOccupancyMachineScheduler);		createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry		static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",		IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
"Run GCN scheduler to maximize occupancy (experimental)",		"Run GCN scheduler to maximize occupancy (experimental)",
createIterativeGCNMaxOccupancyMachineScheduler);		createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry		static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",		GCNMinRegSchedRegistry("gcn-minreg",
"Run GCN iterative scheduler for minimal register usage (experimental)",		"Run GCN iterative scheduler for minimal register usage (experimental)",
createMinRegScheduler);		createMinRegScheduler);

		// Registers createIterativeILPMachineScheduler under the name "gcn-ilp"
		// (the test in this change selects it with -misched=gcn-ilp).
		static MachineSchedRegistry
		GCNILPSchedRegistry("gcn-ilp",
		"Run GCN iterative scheduler for ILP scheduling (experimental)",
		createIterativeILPMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {		static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {		if (TT.getArch() == Triple::r600) {
// 32-bit pointers.		// 32-bit pointers.
if (TT.getEnvironmentName() == "amdgiz" \|\|		if (TT.getEnvironmentName() == "amdgiz" \|\|
TT.getEnvironmentName() == "amdgizcl")		TT.getEnvironmentName() == "amdgizcl")
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"		return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";		"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"		return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
▲ Show 20 Lines • Show All 634 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt

Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines	add_llvm_target(AMDGPUCodeGen
SIMachineScheduler.cpp		SIMachineScheduler.cpp
SIMemoryLegalizer.cpp		SIMemoryLegalizer.cpp
SIOptimizeExecMasking.cpp		SIOptimizeExecMasking.cpp
SIOptimizeExecMaskingPreRA.cpp		SIOptimizeExecMaskingPreRA.cpp
SIPeepholeSDWA.cpp		SIPeepholeSDWA.cpp
SIRegisterInfo.cpp		SIRegisterInfo.cpp
SIShrinkInstructions.cpp		SIShrinkInstructions.cpp
SIWholeQuadMode.cpp		SIWholeQuadMode.cpp
		GCNILPSched.cpp
)		)

add_subdirectory(AsmParser)		add_subdirectory(AsmParser)
add_subdirectory(InstPrinter)		add_subdirectory(InstPrinter)
add_subdirectory(Disassembler)		add_subdirectory(Disassembler)
add_subdirectory(TargetInfo)		add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)		add_subdirectory(MCTargetDesc)
add_subdirectory(Utils)		add_subdirectory(Utils)

llvm/trunk/lib/Target/AMDGPU/GCNILPSched.cpp

				//===---------------------------- GCNILPSched.cpp - -----------------------===//
				//
				// The LLVM Compiler Infrastructure
				//
				// This file is distributed under the University of Illinois Open Source
				// License. See LICENSE.TXT for details.
				//
				//===----------------------------------------------------------------------===//
				//
				/// \file
				//
				//===----------------------------------------------------------------------===//

				#include "llvm/CodeGen/ScheduleDAG.h"

				using namespace llvm;

				#define DEBUG_TYPE "machine-scheduler"

				namespace {

				/// List scheduler that picks SUnits bottom-up (starting from the bottom roots
				/// and releasing predecessors) using the heuristics in pickBest().
				class GCNILPScheduler {
				  /// Intrusive list node wrapping one SUnit that is waiting to be scheduled.
				  struct Candidate : ilist_node<Candidate> {
				    SUnit *SU;

				    Candidate(SUnit *SU_)
				      : SU(SU_) {}
				  };

				  /// Backing storage for every Candidate linked into the queues below.
				  SpecificBumpPtrAllocator<Candidate> Alloc;
				  typedef simple_ilist<Candidate> Queue;
				  /// Candidates whose successors have all been scheduled but which have not
				  /// yet been moved to AvailQueue by releasePending().
				  Queue PendingQueue;
				  /// Candidates pickCandidate() chooses from.
				  Queue AvailQueue;
				  /// Next value stored into SUnit::NodeQueueId when a candidate is moved
				  /// from PendingQueue to AvailQueue.
				  unsigned CurQueueId = 0;

				  /// Sethi-Ullman numbers, indexed by SUnit::NodeNum.
				  std::vector<unsigned> SUNumbers;

				  /// CurCycle - The current scheduler state corresponds to this cycle.
				  unsigned CurCycle = 0;

				  unsigned getNodePriority(const SUnit *SU) const;

				  const SUnit *pickBest(const SUnit *left, const SUnit *right);
				  Candidate* pickCandidate();

				  void releasePending();
				  void advanceToCycle(unsigned NextCycle);
				  void releasePredecessors(const SUnit* SU);

				public:
				  /// Schedule the region described by \p DAG starting from \p BotRoots and
				  /// return the resulting order of SUnits.
				  std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> BotRoots,
				                                     const ScheduleDAG &DAG);
				};
				} // namespace

				/// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
				/// Smaller number is the higher priority.
				static unsigned
				CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
				unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
				if (SethiUllmanNumber != 0)
				return SethiUllmanNumber;

				unsigned Extra = 0;
				for (const SDep &Pred : SU->Preds) {
				if (Pred.isCtrl()) continue; // ignore chain preds
				SUnit *PredSU = Pred.getSUnit();
				unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
				if (PredSethiUllman > SethiUllmanNumber) {
				SethiUllmanNumber = PredSethiUllman;
				Extra = 0;
				}
				else if (PredSethiUllman == SethiUllmanNumber)
				++Extra;
				}

				SethiUllmanNumber += Extra;

				if (SethiUllmanNumber == 0)
				SethiUllmanNumber = 1;

				return SethiUllmanNumber;
				}

				// Priority used by pickBest(): when two candidates differ, the one with the
				// smaller value returned here is preferred.
				//  - A node with predecessors but no successors gets 0xffff.
				//  - A node with successors but no predecessors gets 0.
				//  - Every other node gets its precomputed Sethi-Ullman number.
				unsigned GCNILPScheduler::getNodePriority(const SUnit *SU) const {
				  assert(SU->NodeNum < SUNumbers.size());

				  const bool HasPreds = SU->NumPreds != 0;
				  const bool HasSuccs = SU->NumSuccs != 0;

				  // Ends a chain of computation (nothing consumes it, e.g. a store).
				  if (HasPreds && !HasSuccs)
				    return 0xffff;

				  // Starts a chain of computation (depends on nothing in the region).
				  if (HasSuccs && !HasPreds)
				    return 0;

				  return SUNumbers[SU->NodeNum];
				}

				/// closestSucc - Returns the largest height among the successors of \p SU
				/// that are reached through a non-control edge, or 0 if there are none.
				static unsigned closestSucc(const SUnit *SU) {
				  unsigned Highest = 0;
				  for (const SDep &Edge : SU->Succs) {
				    if (!Edge.isCtrl()) { // ignore chain succs
				      const unsigned SuccHeight = Edge.getSUnit()->getHeight();
				      Highest = SuccHeight > Highest ? SuccHeight : Highest;
				    }
				  }
				  return Highest;
				}

				/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
				/// for scratch registers: the number of non-control (data) predecessor edges
				/// of \p SU.
				static unsigned calcMaxScratches(const SUnit *SU) {
				  unsigned DataPreds = 0;
				  for (const SDep &Edge : SU->Preds)
				    if (!Edge.isCtrl()) // ignore chain preds
				      ++DataPreds;
				  return DataPreds;
				}

				// Return -1 if left has higher priority, 1 if right has higher priority.
				// Return 0 if latency-based priority is equivalent.
				static int BUCompareLatency(const SUnit left, const SUnit right) {
				// Scheduling an instruction that uses a VReg whose postincrement has not yet
				// been scheduled will induce a copy. Model this as an extra cycle of latency.
				int LHeight = (int)left->getHeight();
				int RHeight = (int)right->getHeight();

				// If either node is scheduling for latency, sort them by height/depth
				// and latency.

				// If neither instruction stalls (!LStall && !RStall) and HazardRecognizer
				// is enabled, grouping instructions by cycle, then its height is already
				// covered so only its depth matters. We also reach this point if both stall
				// but have the same height.
				if (LHeight != RHeight)
				return LHeight > RHeight ? 1 : -1;

				int LDepth = left->getDepth();
				int RDepth = right->getDepth();
				if (LDepth != RDepth) {
				DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
				<< ") depth " << LDepth << " vs SU (" << right->NodeNum
				<< ") depth " << RDepth << "\n");
				return LDepth < RDepth ? 1 : -1;
				}
				if (left->Latency != right->Latency)
				return left->Latency > right->Latency ? 1 : -1;

				return 0;
				}

				/// Choose which of two available nodes should be scheduled next and return
				/// it (always one of \p left or \p right). The criteria are applied in order:
				/// depth spread, height spread, Sethi-Ullman priority, height of the data
				/// successors, number of data predecessors, then BUCompareLatency(). A full
				/// tie resolves to \p left.
				const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right)
				{
				  // TODO: add register pressure lowering checks

				  bool const DisableSchedCriticalPath = false;
				  int MaxReorderWindow = 6;
				  if (!DisableSchedCriticalPath) {
				    int spread = (int)left->getDepth() - (int)right->getDepth();
				    if (std::abs(spread) > MaxReorderWindow) {
				      DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
				                   << left->getDepth() << " != SU(" << right->NodeNum << "): "
				                   << right->getDepth() << "\n");
				      return left->getDepth() < right->getDepth() ? right : left;
				    }
				  }

				  bool const DisableSchedHeight = false;
				  if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
				    int spread = (int)left->getHeight() - (int)right->getHeight();
				    if (std::abs(spread) > MaxReorderWindow)
				      return left->getHeight() > right->getHeight() ? right : left;
				  }

				  // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
				  unsigned LPriority = getNodePriority(left);
				  unsigned RPriority = getNodePriority(right);

				  if (LPriority != RPriority)
				    return LPriority > RPriority ? right : left;

				  // Try schedule def + use closer when Sethi-Ullman numbers are the same.
				  // e.g.
				  // t1 = op t2, c1
				  // t3 = op t4, c2
				  //
				  // and the following instructions are both ready.
				  // t2 = op c3
				  // t4 = op c4
				  //
				  // Then schedule t2 = op first.
				  // i.e.
				  // t4 = op c4
				  // t2 = op c3
				  // t1 = op t2, c1
				  // t3 = op t4, c2
				  //
				  // This creates more short live intervals.
				  unsigned LDist = closestSucc(left);
				  unsigned RDist = closestSucc(right);
				  if (LDist != RDist)
				    return LDist < RDist ? right : left;

				  // How many registers becomes live when the node is scheduled.
				  unsigned LScratch = calcMaxScratches(left);
				  unsigned RScratch = calcMaxScratches(right);
				  if (LScratch != RScratch)
				    return LScratch > RScratch ? right : left;

				  bool const DisableSchedCycles = false;
				  if (!DisableSchedCycles) {
				    int result = BUCompareLatency(left, right);
				    if (result != 0)
				      return result > 0 ? right : left;
				    return left;
				  }
				  else {
				    if (left->getHeight() != right->getHeight())
				      return (left->getHeight() > right->getHeight()) ? right : left;

				    if (left->getDepth() != right->getDepth())
				      return (left->getDepth() < right->getDepth()) ? right : left;
				  }

				  // NOTE(review): with DisableSchedCycles hard-coded to false the branch
				  // above always returns, so this NodeQueueId tie-break is never reached.
				  assert(left->NodeQueueId && right->NodeQueueId &&
				         "NodeQueueId cannot be zero");
				  return (left->NodeQueueId > right->NodeQueueId) ? right : left;
				}

				/// Scan AvailQueue front to back and return the candidate that pickBest()
				/// prefers over all others, or nullptr when the queue is empty. The candidate
				/// is not removed from the queue.
				GCNILPScheduler::Candidate* GCNILPScheduler::pickCandidate() {
				  auto It = AvailQueue.begin();
				  const auto End = AvailQueue.end();
				  if (It == End)
				    return nullptr;

				  Candidate *Winner = &*It;
				  for (++It; It != End; ++It) {
				    auto Picked = pickBest(Winner->SU, It->SU);
				    if (Picked == Winner->SU)
				      continue; // current winner stays
				    assert(Picked == It->SU);
				    Winner = &*It;
				  }
				  return Winner;
				}

				void GCNILPScheduler::releasePending() {
				// Check to see if any of the pending instructions are ready to issue. If
				// so, add them to the available queue.
				for(auto I = PendingQueue.begin(), E = PendingQueue.end(); I != E;) {
				auto &C = *I++;
				if (C.SU->getHeight() <= CurCycle) {
				PendingQueue.remove(C);
				AvailQueue.push_back(C);
				C.SU->NodeQueueId = CurQueueId++;
				}
				}
				}

				/// Advance the scheduler state to \p NextCycle and release the pending
				/// candidates that become ready. Does nothing unless \p NextCycle is later
				/// than the current cycle.
				void GCNILPScheduler::advanceToCycle(unsigned NextCycle) {
				  if (NextCycle > CurCycle) {
				    CurCycle = NextCycle;
				    releasePending();
				  }
				}

				/// Account for \p SU having been scheduled: for every non-weak predecessor
				/// edge, raise the predecessor's height to at least SU's height plus the edge
				/// latency and decrement its count of unscheduled successors. A non-boundary
				/// predecessor whose count reaches zero is put at the front of PendingQueue.
				void GCNILPScheduler::releasePredecessors(const SUnit* SU) {
				  for (const auto &PredEdge : SU->Preds) {
				    auto PredSU = PredEdge.getSUnit();
				    if (PredEdge.isWeak())
				      continue;
				    assert(PredSU->isBoundaryNode() || PredSU->NumSuccsLeft > 0);

				    PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge.getLatency());

				    if (!PredSU->isBoundaryNode() && --PredSU->NumSuccsLeft == 0)
				      PendingQueue.push_front(*new (Alloc.Allocate()) Candidate(PredSU));
				  }
				}

				/// Produce a schedule for the region described by \p DAG.
				///
				/// \p BotRoots seed the available queue; nodes are then picked one at a time
				/// with pickCandidate(), releasing their predecessors as they are scheduled.
				/// The picks are collected in selection order and reversed before being
				/// returned. The SUnits of \p DAG are mutated while scheduling (heights,
				/// NumSuccsLeft, NodeQueueId, isScheduled) and restored from a saved copy
				/// before returning.
				std::vector<const SUnit*>
				GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
				                          const ScheduleDAG &DAG) {
				  auto &SUnits = const_cast<ScheduleDAG&>(DAG).SUnits;

				  std::vector<SUnit> SUSavedCopy;
				  SUSavedCopy.resize(SUnits.size());

				  // we cannot save only those fields we touch: some of them are private
				  // so save units verbatim: this assumes SUnit should have value semantics
				  for (const SUnit &SU : SUnits)
				    SUSavedCopy[SU.NodeNum] = SU;

				  SUNumbers.assign(SUnits.size(), 0);
				  for (const SUnit &SU : SUnits)
				    CalcNodeSethiUllmanNumber(&SU, SUNumbers);

				  for (auto SU : BotRoots) {
				    AvailQueue.push_back(
				      *new (Alloc.Allocate()) Candidate(const_cast<SUnit*>(SU)));
				  }
				  releasePredecessors(&DAG.ExitSU);

				  std::vector<const SUnit*> Schedule;
				  Schedule.reserve(SUnits.size());
				  while (true) {
				    // Nothing is available but something is pending: advance to the cycle
				    // of the pending candidate with the smallest height (at least one cycle
				    // forward) so that releasePending() can refill AvailQueue.
				    if (AvailQueue.empty() && !PendingQueue.empty()) {
				      auto EarliestSU = std::min_element(
				        PendingQueue.begin(), PendingQueue.end(),
				        [=](const Candidate& C1, const Candidate& C2) {
				          return C1.SU->getHeight() < C2.SU->getHeight();
				        })->SU;
				      advanceToCycle(std::max(CurCycle + 1, EarliestSU->getHeight()));
				    }
				    if (AvailQueue.empty())
				      break;

				    DEBUG(
				      dbgs() << "\n=== Picking candidate\n"
				                "Ready queue:";
				      for (auto &C : AvailQueue)
				        dbgs() << ' ' << C.SU->NodeNum;
				      dbgs() << '\n';
				    );

				    auto C = pickCandidate();
				    assert(C);
				    AvailQueue.remove(*C);
				    auto SU = C->SU;
				    DEBUG(dbgs() << "Selected "; SU->dump(&DAG));

				    advanceToCycle(SU->getHeight());

				    releasePredecessors(SU);
				    Schedule.push_back(SU);
				    SU->isScheduled = true;
				  }
				  assert(SUnits.size() == Schedule.size());

				  // Nodes were picked bottom-up; flip the list before returning it.
				  std::reverse(Schedule.begin(), Schedule.end());

				  // restore units
				  for (auto &SU : SUnits)
				    SU = SUSavedCopy[SU.NodeNum];

				  return Schedule;
				}

				namespace llvm {
				/// Entry point used outside this file: run a fresh GCNILPScheduler over
				/// \p DAG starting from \p BotRoots and return the schedule it produces.
				std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
				                                              const ScheduleDAG &DAG) {
				  GCNILPScheduler S;
				  return S.schedule(BotRoots, DAG);
				}
				} // namespace llvm

llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.h

Show All 26 Lines

class GCNIterativeScheduler : public ScheduleDAGMILive {		class GCNIterativeScheduler : public ScheduleDAGMILive {
using BaseClass = ScheduleDAGMILive;		using BaseClass = ScheduleDAGMILive;

public:		public:
enum StrategyKind {		enum StrategyKind {
SCHEDULE_MINREGONLY,		SCHEDULE_MINREGONLY,
SCHEDULE_MINREGFORCED,		SCHEDULE_MINREGFORCED,
SCHEDULE_LEGACYMAXOCCUPANCY		SCHEDULE_LEGACYMAXOCCUPANCY,
		SCHEDULE_ILP
};		};

GCNIterativeScheduler(MachineSchedContext *C,		GCNIterativeScheduler(MachineSchedContext *C,
StrategyKind S);		StrategyKind S);

void schedule() override;		void schedule() override;

void enterRegion(MachineBasicBlock *BB,		void enterRegion(MachineBasicBlock *BB,
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	protected:
void scheduleRegion(Region &R, Range &&Schedule,		void scheduleRegion(Region &R, Range &&Schedule,
const GCNRegPressure &MaxRP = GCNRegPressure());		const GCNRegPressure &MaxRP = GCNRegPressure());

unsigned tryMaximizeOccupancy(unsigned TargetOcc =		unsigned tryMaximizeOccupancy(unsigned TargetOcc =
std::numeric_limits<unsigned>::max());		std::numeric_limits<unsigned>::max());

void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);		void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
void scheduleMinReg(bool force = false);		void scheduleMinReg(bool force = false);
		void scheduleILP(bool TryMaximizeOccupancy = true);

void printRegions(raw_ostream &OS) const;		void printRegions(raw_ostream &OS) const;
void printSchedResult(raw_ostream &OS,		void printSchedResult(raw_ostream &OS,
const Region *R,		const Region *R,
const GCNRegPressure &RP) const;		const GCNRegPressure &RP) const;
void printSchedRP(raw_ostream &OS,		void printSchedRP(raw_ostream &OS,
const GCNRegPressure &Before,		const GCNRegPressure &Before,
const GCNRegPressure &After) const;		const GCNRegPressure &After) const;
};		};

} // end namespace llvm		} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H		#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H

llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.cpp

Show All 33 Lines

#define DEBUG_TYPE "machine-scheduler"		#define DEBUG_TYPE "machine-scheduler"

namespace llvm {		namespace llvm {

std::vector<const SUnit > makeMinRegSchedule(ArrayRef<const SUnit > TopRoots,		std::vector<const SUnit > makeMinRegSchedule(ArrayRef<const SUnit > TopRoots,
const ScheduleDAG &DAG);		const ScheduleDAG &DAG);

} // end namespace llvm		std::vector<const SUnit> makeGCNILPScheduler(ArrayRef<const SUnit> BotRoots,
		const ScheduleDAG &DAG);
		}

// shim accessors for different order containers		// shim accessors for different order containers
static inline MachineInstr getMachineInstr(MachineInstr MI) {		static inline MachineInstr getMachineInstr(MachineInstr MI) {
return MI;		return MI;
}		}
static inline MachineInstr getMachineInstr(const SUnit SU) {		static inline MachineInstr getMachineInstr(const SUnit SU) {
return SU->getInstr();		return SU->getInstr();
}		}
▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Lines
}		}
#endif		#endif

// DAG builder helper		// DAG builder helper
class GCNIterativeScheduler::BuildDAG {		class GCNIterativeScheduler::BuildDAG {
GCNIterativeScheduler &Sch;		GCNIterativeScheduler &Sch;
SmallVector<SUnit *, 8> TopRoots;		SmallVector<SUnit *, 8> TopRoots;

		SmallVector<SUnit*, 8> BotRoots;
public:		public:
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)		BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
: Sch(_Sch) {		: Sch(_Sch) {
auto BB = R.Begin->getParent();		auto BB = R.Begin->getParent();
Sch.BaseClass::startBlock(BB);		Sch.BaseClass::startBlock(BB);
Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);		Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);

Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,		Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
/TrackLaneMask/true);		/TrackLaneMask/true);
Sch.Topo.InitDAGTopologicalSorting();		Sch.Topo.InitDAGTopologicalSorting();

SmallVector<SUnit *, 8> BotRoots;
Sch.findRootsAndBiasEdges(TopRoots, BotRoots);		Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
}		}

~BuildDAG() {		~BuildDAG() {
Sch.BaseClass::exitRegion();		Sch.BaseClass::exitRegion();
Sch.BaseClass::finishBlock();		Sch.BaseClass::finishBlock();
}		}

ArrayRef<const SUnit *> getTopRoots() const {		ArrayRef<const SUnit *> getTopRoots() const {
return TopRoots;		return TopRoots;
}		}
		ArrayRef<SUnit*> getBottomRoots() const {
		return BotRoots;
		}
};		};

class GCNIterativeScheduler::OverrideLegacyStrategy {		class GCNIterativeScheduler::OverrideLegacyStrategy {
GCNIterativeScheduler &Sch;		GCNIterativeScheduler &Sch;
Region &Rgn;		Region &Rgn;
std::unique_ptr<MachineSchedStrategy> SaveSchedImpl;		std::unique_ptr<MachineSchedStrategy> SaveSchedImpl;
GCNRegPressure SaveMaxRP;		GCNRegPressure SaveMaxRP;

▲ Show 20 Lines • Show All 143 Lines • ▼ Show 20 Lines

void GCNIterativeScheduler::finalizeSchedule() { // overriden		void GCNIterativeScheduler::finalizeSchedule() { // overriden
if (Regions.empty())		if (Regions.empty())
return;		return;
switch (Strategy) {		switch (Strategy) {
case SCHEDULE_MINREGONLY: scheduleMinReg(); break;		case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;		case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;		case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
		case SCHEDULE_ILP: scheduleILP(false); break;
}		}
}		}

// Detach schedule from SUnits and interleave it with debug values.		// Detach schedule from SUnits and interleave it with debug values.
// Returned schedule becomes independent of DAG state.		// Returned schedule becomes independent of DAG state.
std::vector<MachineInstr*>		std::vector<MachineInstr*>
GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const {		GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const {
std::vector<MachineInstr*> Res;		std::vector<MachineInstr*> Res;
▲ Show 20 Lines • Show All 214 Lines • ▼ Show 20 Lines	if (!force && MaxPressure.less(ST, RP, TgtOcc))
break;		break;

scheduleRegion(*R, MinSchedule, RP);		scheduleRegion(*R, MinSchedule, RP);
DEBUG(printSchedResult(dbgs(), R, RP));		DEBUG(printSchedResult(dbgs(), R, RP));

MaxPressure = RP;		MaxPressure = RP;
}		}
}		}

		///////////////////////////////////////////////////////////////////////////////
		// ILP scheduler port

		/// Schedule every region with the ILP scheduler.
		///
		/// The target occupancy is the smaller of the occupancy allowed by local
		/// memory size and the upper bound returned by getWavesPerEU(); it is then
		/// clamped to the occupancy of the first region after sorting by pressure
		/// (optionally after trying to raise that via tryMaximizeOccupancy()).
		/// For each region, the ILP schedule is applied only if its register pressure
		/// still reaches the target occupancy; otherwise the region's saved best
		/// schedule is applied when it reaches the target, and the region is left
		/// untouched if not.
		///
		/// \param TryMaximizeOccupancy when true, call tryMaximizeOccupancy() if the
		///        current occupancy is below the target.
		void GCNIterativeScheduler::scheduleILP(
		  bool TryMaximizeOccupancy) {
		  const auto &ST = MF.getSubtarget<SISubtarget>();
		  auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF),
		                         ST.getWavesPerEU(*MF.getFunction()).second);

		  sortRegionsByPressure(TgtOcc);
		  auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);

		  if (TryMaximizeOccupancy && Occ < TgtOcc)
		    Occ = tryMaximizeOccupancy(TgtOcc);

		  TgtOcc = std::min(Occ, TgtOcc);
		  DEBUG(dbgs() << "Scheduling using default scheduler, "
		                  "target occupancy = " << TgtOcc << '\n');

		  for (auto R : Regions) {
		    BuildDAG DAG(*R, *this);
		    const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);

		    const auto RP = getSchedulePressure(*R, ILPSchedule);
		    DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));

		    if (RP.getOccupancy(ST) < TgtOcc) {
		      DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
		      if (R->BestSchedule.get() &&
		          R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
		        DEBUG(dbgs() << ", scheduling minimal register\n");
		        scheduleBest(*R);
		      }
		    } else {
		      scheduleRegion(*R, ILPSchedule, RP);
		      DEBUG(printSchedResult(dbgs(), R, RP));
		    }
		  }
		}

llvm/trunk/test/CodeGen/AMDGPU/schedule-ilp.ll

				; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s \| FileCheck %s

				; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}

				define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) #0 {
				bb:
				%tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
				%tmp2 = load float, float addrspace(3)* %tmp, align 4
				%tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
				%tmp4 = load float, float addrspace(3)* %tmp3, align 4
				%tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
				%tmp6 = load float, float addrspace(3)* %tmp5, align 4
				%tmp7 = tail call float @llvm.fmuladd.f32(float %tmp2, float %tmp4, float %tmp6)
				%tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
				%tmp9 = load float, float addrspace(3)* %tmp8, align 4
				%tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
				%tmp11 = load float, float addrspace(3)* %tmp10, align 4
				%tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
				%tmp13 = load float, float addrspace(3)* %tmp12, align 4
				%tmp14 = tail call float @llvm.fmuladd.f32(float %tmp9, float %tmp11, float %tmp13)
				%tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
				%tmp16 = load float, float addrspace(3)* %tmp15, align 4
				%tmp17 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
				%tmp18 = load float, float addrspace(3)* %tmp17, align 4
				%tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
				%tmp20 = load float, float addrspace(3)* %tmp19, align 4
				%tmp21 = tail call float @llvm.fmuladd.f32(float %tmp16, float %tmp18, float %tmp20)
				%tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
				%tmp23 = load float, float addrspace(3)* %tmp22, align 4
				%tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
				%tmp25 = load float, float addrspace(3)* %tmp24, align 4
				%tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
				%tmp27 = load float, float addrspace(3)* %tmp26, align 4
				%tmp28 = tail call float @llvm.fmuladd.f32(float %tmp23, float %tmp25, float %tmp27)
				%tmp29 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
				%tmp30 = load float, float addrspace(3)* %tmp29, align 4
				%tmp31 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
				%tmp32 = load float, float addrspace(3)* %tmp31, align 4
				%tmp33 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
				%tmp34 = load float, float addrspace(3)* %tmp33, align 4
				%tmp35 = tail call float @llvm.fmuladd.f32(float %tmp30, float %tmp32, float %tmp34)
				%tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
				%tmp37 = load float, float addrspace(3)* %tmp36, align 4
				%tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
				%tmp39 = load float, float addrspace(3)* %tmp38, align 4
				%tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
				%tmp41 = load float, float addrspace(3)* %tmp40, align 4
				%tmp42 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41)
				%tmp43 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
				%tmp44 = load float, float addrspace(3)* %tmp43, align 4
				%tmp45 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
				%tmp46 = load float, float addrspace(3)* %tmp45, align 4
				%tmp47 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
				%tmp48 = load float, float addrspace(3)* %tmp47, align 4
				%tmp49 = tail call float @llvm.fmuladd.f32(float %tmp44, float %tmp46, float %tmp48)
				%tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
				%tmp51 = load float, float addrspace(3)* %tmp50, align 4
				%tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
				%tmp53 = load float, float addrspace(3)* %tmp52, align 4
				%tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 31
				%tmp55 = load float, float addrspace(3)* %tmp54, align 4
				%tmp56 = tail call float @llvm.fmuladd.f32(float %tmp51, float %tmp53, float %tmp55)
				%tmp57 = getelementptr inbounds float, float addrspace(3)* %arg, i32 33
				%tmp58 = load float, float addrspace(3)* %tmp57, align 4
				%tmp59 = getelementptr inbounds float, float addrspace(3)* %arg, i32 34
				%tmp60 = load float, float addrspace(3)* %tmp59, align 4
				%tmp61 = getelementptr inbounds float, float addrspace(3)* %arg, i32 35
				%tmp62 = load float, float addrspace(3)* %tmp61, align 4
				%tmp63 = tail call float @llvm.fmuladd.f32(float %tmp58, float %tmp60, float %tmp62)
				%tmp64 = getelementptr inbounds float, float addrspace(3)* %arg, i32 37
				%tmp65 = load float, float addrspace(3)* %tmp64, align 4
				%tmp66 = getelementptr inbounds float, float addrspace(3)* %arg, i32 38
				%tmp67 = load float, float addrspace(3)* %tmp66, align 4
				%tmp68 = getelementptr inbounds float, float addrspace(3)* %arg, i32 39
				%tmp69 = load float, float addrspace(3)* %tmp68, align 4
				%tmp70 = tail call float @llvm.fmuladd.f32(float %tmp65, float %tmp67, float %tmp69)
				%tmp71 = getelementptr inbounds float, float addrspace(3)* %arg, i32 41
				%tmp72 = load float, float addrspace(3)* %tmp71, align 4
				%tmp73 = getelementptr inbounds float, float addrspace(3)* %arg, i32 42
				%tmp74 = load float, float addrspace(3)* %tmp73, align 4
				%tmp75 = getelementptr inbounds float, float addrspace(3)* %arg, i32 43
				%tmp76 = load float, float addrspace(3)* %tmp75, align 4
				%tmp77 = tail call float @llvm.fmuladd.f32(float %tmp72, float %tmp74, float %tmp76)
				%tmp78 = getelementptr inbounds float, float addrspace(3)* %arg, i32 45
				%tmp79 = load float, float addrspace(3)* %tmp78, align 4
				%tmp80 = getelementptr inbounds float, float addrspace(3)* %arg, i32 46
				%tmp81 = load float, float addrspace(3)* %tmp80, align 4
				%tmp82 = getelementptr inbounds float, float addrspace(3)* %arg, i32 47
				%tmp83 = load float, float addrspace(3)* %tmp82, align 4
				%tmp84 = tail call float @llvm.fmuladd.f32(float %tmp79, float %tmp81, float %tmp83)
				%tmp85 = getelementptr inbounds float, float addrspace(3)* %arg, i32 49
				%tmp86 = load float, float addrspace(3)* %tmp85, align 4
				%tmp87 = getelementptr inbounds float, float addrspace(3)* %arg, i32 50
				%tmp88 = load float, float addrspace(3)* %tmp87, align 4
				%tmp89 = getelementptr inbounds float, float addrspace(3)* %arg, i32 51
				%tmp90 = load float, float addrspace(3)* %tmp89, align 4
				%tmp91 = tail call float @llvm.fmuladd.f32(float %tmp86, float %tmp88, float %tmp90)
				%tmp92 = getelementptr inbounds float, float addrspace(3)* %arg, i32 53
				%tmp93 = load float, float addrspace(3)* %tmp92, align 4
				%tmp94 = getelementptr inbounds float, float addrspace(3)* %arg, i32 54
				%tmp95 = load float, float addrspace(3)* %tmp94, align 4
				%tmp96 = getelementptr inbounds float, float addrspace(3)* %arg, i32 55
				%tmp97 = load float, float addrspace(3)* %tmp96, align 4
				%tmp98 = tail call float @llvm.fmuladd.f32(float %tmp93, float %tmp95, float %tmp97)
				%tmp99 = getelementptr inbounds float, float addrspace(3)* %arg, i32 57
				%tmp100 = load float, float addrspace(3)* %tmp99, align 4
				%tmp101 = getelementptr inbounds float, float addrspace(3)* %arg, i32 58
				%tmp102 = load float, float addrspace(3)* %tmp101, align 4
				%tmp103 = getelementptr inbounds float, float addrspace(3)* %arg, i32 59
				%tmp104 = load float, float addrspace(3)* %tmp103, align 4
				%tmp105 = tail call float @llvm.fmuladd.f32(float %tmp100, float %tmp102, float %tmp104)
				%tmp106 = getelementptr inbounds float, float addrspace(3)* %arg, i32 61
				%tmp107 = load float, float addrspace(3)* %tmp106, align 4
				%tmp108 = getelementptr inbounds float, float addrspace(3)* %arg, i32 62
				%tmp109 = load float, float addrspace(3)* %tmp108, align 4
				%tmp110 = getelementptr inbounds float, float addrspace(3)* %arg, i32 63
				%tmp111 = load float, float addrspace(3)* %tmp110, align 4
				%tmp112 = tail call float @llvm.fmuladd.f32(float %tmp107, float %tmp109, float %tmp111)
				%tmp113 = getelementptr inbounds float, float addrspace(3)* %arg, i32 65
				%tmp114 = load float, float addrspace(3)* %tmp113, align 4
				%tmp115 = getelementptr inbounds float, float addrspace(3)* %arg, i32 66
				%tmp116 = load float, float addrspace(3)* %tmp115, align 4
				%tmp117 = getelementptr inbounds float, float addrspace(3)* %arg, i32 67
				%tmp118 = load float, float addrspace(3)* %tmp117, align 4
				%tmp119 = tail call float @llvm.fmuladd.f32(float %tmp114, float %tmp116, float %tmp118)
				%tmp120 = getelementptr inbounds float, float addrspace(3)* %arg, i32 69
				%tmp121 = load float, float addrspace(3)* %tmp120, align 4
				%tmp122 = getelementptr inbounds float, float addrspace(3)* %arg, i32 70
				%tmp123 = load float, float addrspace(3)* %tmp122, align 4
				%tmp124 = getelementptr inbounds float, float addrspace(3)* %arg, i32 71
				%tmp125 = load float, float addrspace(3)* %tmp124, align 4
				%tmp126 = tail call float @llvm.fmuladd.f32(float %tmp121, float %tmp123, float %tmp125)
				%tmp127 = getelementptr inbounds float, float addrspace(3)* %arg, i32 73
				%tmp128 = load float, float addrspace(3)* %tmp127, align 4
				%tmp129 = getelementptr inbounds float, float addrspace(3)* %arg, i32 74
				%tmp130 = load float, float addrspace(3)* %tmp129, align 4
				%tmp131 = getelementptr inbounds float, float addrspace(3)* %arg, i32 75
				%tmp132 = load float, float addrspace(3)* %tmp131, align 4
				%tmp133 = tail call float @llvm.fmuladd.f32(float %tmp128, float %tmp130, float %tmp132)
				%tmp134 = getelementptr inbounds float, float addrspace(3)* %arg, i32 77
				%tmp135 = load float, float addrspace(3)* %tmp134, align 4
				%tmp136 = getelementptr inbounds float, float addrspace(3)* %arg, i32 78
				%tmp137 = load float, float addrspace(3)* %tmp136, align 4
				%tmp138 = getelementptr inbounds float, float addrspace(3)* %arg, i32 79
				%tmp139 = load float, float addrspace(3)* %tmp138, align 4
				%tmp140 = tail call float @llvm.fmuladd.f32(float %tmp135, float %tmp137, float %tmp139)
				%tmp141 = getelementptr inbounds float, float addrspace(3)* %arg, i32 81
				%tmp142 = load float, float addrspace(3)* %tmp141, align 4
				%tmp143 = getelementptr inbounds float, float addrspace(3)* %arg, i32 82
				%tmp144 = load float, float addrspace(3)* %tmp143, align 4
				%tmp145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 83
				%tmp146 = load float, float addrspace(3)* %tmp145, align 4
				%tmp147 = tail call float @llvm.fmuladd.f32(float %tmp142, float %tmp144, float %tmp146)
				%tmp148 = getelementptr inbounds float, float addrspace(3)* %arg, i32 85
				%tmp149 = load float, float addrspace(3)* %tmp148, align 4
				%tmp150 = getelementptr inbounds float, float addrspace(3)* %arg, i32 86
				%tmp151 = load float, float addrspace(3)* %tmp150, align 4
				%tmp152 = getelementptr inbounds float, float addrspace(3)* %arg, i32 87
				%tmp153 = load float, float addrspace(3)* %tmp152, align 4
				%tmp154 = tail call float @llvm.fmuladd.f32(float %tmp149, float %tmp151, float %tmp153)
				%tmp155 = getelementptr inbounds float, float addrspace(3)* %arg, i32 89
				%tmp156 = load float, float addrspace(3)* %tmp155, align 4
				%tmp157 = getelementptr inbounds float, float addrspace(3)* %arg, i32 90
				%tmp158 = load float, float addrspace(3)* %tmp157, align 4
				%tmp159 = getelementptr inbounds float, float addrspace(3)* %arg, i32 91
				%tmp160 = load float, float addrspace(3)* %tmp159, align 4
				%tmp161 = tail call float @llvm.fmuladd.f32(float %tmp156, float %tmp158, float %tmp160)
				%tmp162 = getelementptr inbounds float, float addrspace(3)* %arg, i32 93
				%tmp163 = load float, float addrspace(3)* %tmp162, align 4
				%tmp164 = getelementptr inbounds float, float addrspace(3)* %arg, i32 94
				%tmp165 = load float, float addrspace(3)* %tmp164, align 4
				%tmp166 = getelementptr inbounds float, float addrspace(3)* %arg, i32 95
				%tmp167 = load float, float addrspace(3)* %tmp166, align 4
				%tmp168 = tail call float @llvm.fmuladd.f32(float %tmp163, float %tmp165, float %tmp167)
				%tmp169 = getelementptr inbounds float, float addrspace(3)* %arg, i32 97
				%tmp170 = load float, float addrspace(3)* %tmp169, align 4
				%tmp171 = getelementptr inbounds float, float addrspace(3)* %arg, i32 98
				%tmp172 = load float, float addrspace(3)* %tmp171, align 4
				%tmp173 = getelementptr inbounds float, float addrspace(3)* %arg, i32 99
				%tmp174 = load float, float addrspace(3)* %tmp173, align 4
				%tmp175 = tail call float @llvm.fmuladd.f32(float %tmp170, float %tmp172, float %tmp174)
				%tmp176 = getelementptr inbounds float, float addrspace(3)* %arg, i32 101
				%tmp177 = load float, float addrspace(3)* %tmp176, align 4
				%tmp178 = getelementptr inbounds float, float addrspace(3)* %arg, i32 102
				%tmp179 = load float, float addrspace(3)* %tmp178, align 4
				%tmp180 = getelementptr inbounds float, float addrspace(3)* %arg, i32 103
				%tmp181 = load float, float addrspace(3)* %tmp180, align 4
				%tmp182 = tail call float @llvm.fmuladd.f32(float %tmp177, float %tmp179, float %tmp181)
				%tmp183 = getelementptr inbounds float, float addrspace(3)* %arg, i32 105
				%tmp184 = load float, float addrspace(3)* %tmp183, align 4
				%tmp185 = getelementptr inbounds float, float addrspace(3)* %arg, i32 106
				%tmp186 = load float, float addrspace(3)* %tmp185, align 4
				%tmp187 = getelementptr inbounds float, float addrspace(3)* %arg, i32 107
				%tmp188 = load float, float addrspace(3)* %tmp187, align 4
				%tmp189 = tail call float @llvm.fmuladd.f32(float %tmp184, float %tmp186, float %tmp188)
				%tmp190 = getelementptr inbounds float, float addrspace(3)* %arg, i32 109
				%tmp191 = load float, float addrspace(3)* %tmp190, align 4
				%tmp192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 110
				%tmp193 = load float, float addrspace(3)* %tmp192, align 4
				%tmp194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 111
				%tmp195 = load float, float addrspace(3)* %tmp194, align 4
				%tmp196 = tail call float @llvm.fmuladd.f32(float %tmp191, float %tmp193, float %tmp195)
				%tmp197 = getelementptr inbounds float, float addrspace(3)* %arg, i32 113
				%tmp198 = load float, float addrspace(3)* %tmp197, align 4
				%tmp199 = getelementptr inbounds float, float addrspace(3)* %arg, i32 114
				%tmp200 = load float, float addrspace(3)* %tmp199, align 4
				%tmp201 = getelementptr inbounds float, float addrspace(3)* %arg, i32 115
				%tmp202 = load float, float addrspace(3)* %tmp201, align 4
				%tmp203 = tail call float @llvm.fmuladd.f32(float %tmp198, float %tmp200, float %tmp202)
				%tmp204 = getelementptr inbounds float, float addrspace(3)* %arg, i32 117
				%tmp205 = load float, float addrspace(3)* %tmp204, align 4
				%tmp206 = getelementptr inbounds float, float addrspace(3)* %arg, i32 118
				%tmp207 = load float, float addrspace(3)* %tmp206, align 4
				%tmp208 = getelementptr inbounds float, float addrspace(3)* %arg, i32 119
				%tmp209 = load float, float addrspace(3)* %tmp208, align 4
				%tmp210 = tail call float @llvm.fmuladd.f32(float %tmp205, float %tmp207, float %tmp209)
				%tmp211 = getelementptr inbounds float, float addrspace(3)* %arg, i32 121
				%tmp212 = load float, float addrspace(3)* %tmp211, align 4
				%tmp213 = getelementptr inbounds float, float addrspace(3)* %arg, i32 122
				%tmp214 = load float, float addrspace(3)* %tmp213, align 4
				%tmp215 = getelementptr inbounds float, float addrspace(3)* %arg, i32 123
				%tmp216 = load float, float addrspace(3)* %tmp215, align 4
				%tmp217 = tail call float @llvm.fmuladd.f32(float %tmp212, float %tmp214, float %tmp216)
				%tmp218 = getelementptr inbounds float, float addrspace(3)* %arg, i32 125
				%tmp219 = load float, float addrspace(3)* %tmp218, align 4
				%tmp220 = getelementptr inbounds float, float addrspace(3)* %arg, i32 126
				%tmp221 = load float, float addrspace(3)* %tmp220, align 4
				%tmp222 = getelementptr inbounds float, float addrspace(3)* %arg, i32 127
				%tmp223 = load float, float addrspace(3)* %tmp222, align 4
				%tmp224 = tail call float @llvm.fmuladd.f32(float %tmp219, float %tmp221, float %tmp223)
				%tmp225 = getelementptr inbounds float, float addrspace(3)* %arg, i32 129
				%tmp226 = load float, float addrspace(3)* %tmp225, align 4
				%tmp227 = getelementptr inbounds float, float addrspace(3)* %arg, i32 130
				%tmp228 = load float, float addrspace(3)* %tmp227, align 4
				%tmp229 = getelementptr inbounds float, float addrspace(3)* %arg, i32 131
				%tmp230 = load float, float addrspace(3)* %tmp229, align 4
				%tmp231 = tail call float @llvm.fmuladd.f32(float %tmp226, float %tmp228, float %tmp230)
				%tmp232 = getelementptr inbounds float, float addrspace(3)* %arg, i32 133
				%tmp233 = load float, float addrspace(3)* %tmp232, align 4
				%tmp234 = getelementptr inbounds float, float addrspace(3)* %arg, i32 134
				%tmp235 = load float, float addrspace(3)* %tmp234, align 4
				%tmp236 = getelementptr inbounds float, float addrspace(3)* %arg, i32 135
				%tmp237 = load float, float addrspace(3)* %tmp236, align 4
				%tmp238 = tail call float @llvm.fmuladd.f32(float %tmp233, float %tmp235, float %tmp237)
				%tmp239 = getelementptr inbounds float, float addrspace(3)* %arg, i32 137
				%tmp240 = load float, float addrspace(3)* %tmp239, align 4
				%tmp241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 138
				%tmp242 = load float, float addrspace(3)* %tmp241, align 4
				%tmp243 = getelementptr inbounds float, float addrspace(3)* %arg, i32 139
				%tmp244 = load float, float addrspace(3)* %tmp243, align 4
				%tmp245 = tail call float @llvm.fmuladd.f32(float %tmp240, float %tmp242, float %tmp244)
				%tmp246 = getelementptr inbounds float, float addrspace(3)* %arg, i32 141
				%tmp247 = load float, float addrspace(3)* %tmp246, align 4
				%tmp248 = getelementptr inbounds float, float addrspace(3)* %arg, i32 142
				%tmp249 = load float, float addrspace(3)* %tmp248, align 4
				%tmp250 = getelementptr inbounds float, float addrspace(3)* %arg, i32 143
				%tmp251 = load float, float addrspace(3)* %tmp250, align 4
				%tmp252 = tail call float @llvm.fmuladd.f32(float %tmp247, float %tmp249, float %tmp251)
				%tmp253 = getelementptr inbounds float, float addrspace(3)* %arg, i32 145
				%tmp254 = load float, float addrspace(3)* %tmp253, align 4
				%tmp255 = getelementptr inbounds float, float addrspace(3)* %arg, i32 146
				%tmp256 = load float, float addrspace(3)* %tmp255, align 4
				%tmp257 = getelementptr inbounds float, float addrspace(3)* %arg, i32 147
				%tmp258 = load float, float addrspace(3)* %tmp257, align 4
				%tmp259 = tail call float @llvm.fmuladd.f32(float %tmp254, float %tmp256, float %tmp258)
				%tmp260 = getelementptr inbounds float, float addrspace(3)* %arg, i32 149
				%tmp261 = load float, float addrspace(3)* %tmp260, align 4
				%tmp262 = getelementptr inbounds float, float addrspace(3)* %arg, i32 150
				%tmp263 = load float, float addrspace(3)* %tmp262, align 4
				%tmp264 = getelementptr inbounds float, float addrspace(3)* %arg, i32 151
				%tmp265 = load float, float addrspace(3)* %tmp264, align 4
				%tmp266 = tail call float @llvm.fmuladd.f32(float %tmp261, float %tmp263, float %tmp265)
				%tmp267 = getelementptr inbounds float, float addrspace(3)* %arg, i32 153
				%tmp268 = load float, float addrspace(3)* %tmp267, align 4
				%tmp269 = getelementptr inbounds float, float addrspace(3)* %arg, i32 154
				%tmp270 = load float, float addrspace(3)* %tmp269, align 4
				%tmp271 = getelementptr inbounds float, float addrspace(3)* %arg, i32 155
				%tmp272 = load float, float addrspace(3)* %tmp271, align 4
				%tmp273 = tail call float @llvm.fmuladd.f32(float %tmp268, float %tmp270, float %tmp272)
				%tmp274 = getelementptr inbounds float, float addrspace(3)* %arg, i32 157
				%tmp275 = load float, float addrspace(3)* %tmp274, align 4
				%tmp276 = getelementptr inbounds float, float addrspace(3)* %arg, i32 158
				%tmp277 = load float, float addrspace(3)* %tmp276, align 4
				%tmp278 = getelementptr inbounds float, float addrspace(3)* %arg, i32 159
				%tmp279 = load float, float addrspace(3)* %tmp278, align 4
				%tmp280 = tail call float @llvm.fmuladd.f32(float %tmp275, float %tmp277, float %tmp279)
				%tmp281 = getelementptr inbounds float, float addrspace(3)* %arg, i32 161
				%tmp282 = load float, float addrspace(3)* %tmp281, align 4
				%tmp283 = getelementptr inbounds float, float addrspace(3)* %arg, i32 162
				%tmp284 = load float, float addrspace(3)* %tmp283, align 4
				%tmp285 = getelementptr inbounds float, float addrspace(3)* %arg, i32 163
				%tmp286 = load float, float addrspace(3)* %tmp285, align 4
				%tmp287 = tail call float @llvm.fmuladd.f32(float %tmp282, float %tmp284, float %tmp286)
				%tmp288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 165
				%tmp289 = load float, float addrspace(3)* %tmp288, align 4
				%tmp290 = getelementptr inbounds float, float addrspace(3)* %arg, i32 166
				%tmp291 = load float, float addrspace(3)* %tmp290, align 4
				%tmp292 = getelementptr inbounds float, float addrspace(3)* %arg, i32 167
				%tmp293 = load float, float addrspace(3)* %tmp292, align 4
				%tmp294 = tail call float @llvm.fmuladd.f32(float %tmp289, float %tmp291, float %tmp293)
				%tmp295 = getelementptr inbounds float, float addrspace(3)* %arg, i32 169
				%tmp296 = load float, float addrspace(3)* %tmp295, align 4
				%tmp297 = getelementptr inbounds float, float addrspace(3)* %arg, i32 170
				%tmp298 = load float, float addrspace(3)* %tmp297, align 4
				%tmp299 = getelementptr inbounds float, float addrspace(3)* %arg, i32 171
				%tmp300 = load float, float addrspace(3)* %tmp299, align 4
				%tmp301 = tail call float @llvm.fmuladd.f32(float %tmp296, float %tmp298, float %tmp300)
				%tmp302 = getelementptr inbounds float, float addrspace(3)* %arg, i32 173
				%tmp303 = load float, float addrspace(3)* %tmp302, align 4
				%tmp304 = getelementptr inbounds float, float addrspace(3)* %arg, i32 174
				%tmp305 = load float, float addrspace(3)* %tmp304, align 4
				%tmp306 = getelementptr inbounds float, float addrspace(3)* %arg, i32 175
				%tmp307 = load float, float addrspace(3)* %tmp306, align 4
				%tmp308 = tail call float @llvm.fmuladd.f32(float %tmp303, float %tmp305, float %tmp307)
				%tmp309 = getelementptr inbounds float, float addrspace(3)* %arg, i32 177
				%tmp310 = load float, float addrspace(3)* %tmp309, align 4
				%tmp311 = getelementptr inbounds float, float addrspace(3)* %arg, i32 178
				%tmp312 = load float, float addrspace(3)* %tmp311, align 4
				%tmp313 = getelementptr inbounds float, float addrspace(3)* %arg, i32 179
				%tmp314 = load float, float addrspace(3)* %tmp313, align 4
				%tmp315 = tail call float @llvm.fmuladd.f32(float %tmp310, float %tmp312, float %tmp314)
				%tmp316 = getelementptr inbounds float, float addrspace(3)* %arg, i32 181
				%tmp317 = load float, float addrspace(3)* %tmp316, align 4
				%tmp318 = getelementptr inbounds float, float addrspace(3)* %arg, i32 182
				%tmp319 = load float, float addrspace(3)* %tmp318, align 4
				%tmp320 = getelementptr inbounds float, float addrspace(3)* %arg, i32 183
				%tmp321 = load float, float addrspace(3)* %tmp320, align 4
				%tmp322 = tail call float @llvm.fmuladd.f32(float %tmp317, float %tmp319, float %tmp321)
				%tmp323 = getelementptr inbounds float, float addrspace(3)* %arg, i32 185
				%tmp324 = load float, float addrspace(3)* %tmp323, align 4
				%tmp325 = getelementptr inbounds float, float addrspace(3)* %arg, i32 186
				%tmp326 = load float, float addrspace(3)* %tmp325, align 4
				%tmp327 = getelementptr inbounds float, float addrspace(3)* %arg, i32 187
				%tmp328 = load float, float addrspace(3)* %tmp327, align 4
				%tmp329 = tail call float @llvm.fmuladd.f32(float %tmp324, float %tmp326, float %tmp328)
				%tmp330 = getelementptr inbounds float, float addrspace(3)* %arg, i32 189
				%tmp331 = load float, float addrspace(3)* %tmp330, align 4
				%tmp332 = getelementptr inbounds float, float addrspace(3)* %arg, i32 190
				%tmp333 = load float, float addrspace(3)* %tmp332, align 4
				%tmp334 = getelementptr inbounds float, float addrspace(3)* %arg, i32 191
				%tmp335 = load float, float addrspace(3)* %tmp334, align 4
				%tmp336 = tail call float @llvm.fmuladd.f32(float %tmp331, float %tmp333, float %tmp335)
				%tmp337 = getelementptr inbounds float, float addrspace(3)* %arg, i32 193
				%tmp338 = load float, float addrspace(3)* %tmp337, align 4
				%tmp339 = getelementptr inbounds float, float addrspace(3)* %arg, i32 194
				%tmp340 = load float, float addrspace(3)* %tmp339, align 4
				%tmp341 = getelementptr inbounds float, float addrspace(3)* %arg, i32 195
				%tmp342 = load float, float addrspace(3)* %tmp341, align 4
				%tmp343 = tail call float @llvm.fmuladd.f32(float %tmp338, float %tmp340, float %tmp342)
				%tmp344 = getelementptr inbounds float, float addrspace(3)* %arg, i32 197
				%tmp345 = load float, float addrspace(3)* %tmp344, align 4
				%tmp346 = getelementptr inbounds float, float addrspace(3)* %arg, i32 198
				%tmp347 = load float, float addrspace(3)* %tmp346, align 4
				%tmp348 = getelementptr inbounds float, float addrspace(3)* %arg, i32 199
				%tmp349 = load float, float addrspace(3)* %tmp348, align 4
				%tmp350 = tail call float @llvm.fmuladd.f32(float %tmp345, float %tmp347, float %tmp349)
				%tmp351 = getelementptr inbounds float, float addrspace(3)* %arg, i32 201
				%tmp352 = load float, float addrspace(3)* %tmp351, align 4
				%tmp353 = getelementptr inbounds float, float addrspace(3)* %arg, i32 202
				%tmp354 = load float, float addrspace(3)* %tmp353, align 4
				%tmp355 = getelementptr inbounds float, float addrspace(3)* %arg, i32 203
				%tmp356 = load float, float addrspace(3)* %tmp355, align 4
				%tmp357 = tail call float @llvm.fmuladd.f32(float %tmp352, float %tmp354, float %tmp356)
				%tmp358 = getelementptr inbounds float, float addrspace(3)* %arg, i32 205
				%tmp359 = load float, float addrspace(3)* %tmp358, align 4
				%tmp360 = getelementptr inbounds float, float addrspace(3)* %arg, i32 206
				%tmp361 = load float, float addrspace(3)* %tmp360, align 4
				%tmp362 = getelementptr inbounds float, float addrspace(3)* %arg, i32 207
				%tmp363 = load float, float addrspace(3)* %tmp362, align 4
				%tmp364 = tail call float @llvm.fmuladd.f32(float %tmp359, float %tmp361, float %tmp363)
				%tmp365 = getelementptr inbounds float, float addrspace(3)* %arg, i32 209
				%tmp366 = load float, float addrspace(3)* %tmp365, align 4
				%tmp367 = getelementptr inbounds float, float addrspace(3)* %arg, i32 210
				%tmp368 = load float, float addrspace(3)* %tmp367, align 4
				%tmp369 = getelementptr inbounds float, float addrspace(3)* %arg, i32 211
				%tmp370 = load float, float addrspace(3)* %tmp369, align 4
				%tmp371 = tail call float @llvm.fmuladd.f32(float %tmp366, float %tmp368, float %tmp370)
				%tmp372 = getelementptr inbounds float, float addrspace(3)* %arg, i32 213
				%tmp373 = load float, float addrspace(3)* %tmp372, align 4
				%tmp374 = getelementptr inbounds float, float addrspace(3)* %arg, i32 214
				%tmp375 = load float, float addrspace(3)* %tmp374, align 4
				%tmp376 = getelementptr inbounds float, float addrspace(3)* %arg, i32 215
				%tmp377 = load float, float addrspace(3)* %tmp376, align 4
				%tmp378 = tail call float @llvm.fmuladd.f32(float %tmp373, float %tmp375, float %tmp377)
				%tmp379 = getelementptr inbounds float, float addrspace(3)* %arg, i32 217
				%tmp380 = load float, float addrspace(3)* %tmp379, align 4
				%tmp381 = getelementptr inbounds float, float addrspace(3)* %arg, i32 218
				%tmp382 = load float, float addrspace(3)* %tmp381, align 4
				%tmp383 = getelementptr inbounds float, float addrspace(3)* %arg, i32 219
				%tmp384 = load float, float addrspace(3)* %tmp383, align 4
				%tmp385 = tail call float @llvm.fmuladd.f32(float %tmp380, float %tmp382, float %tmp384)
				%tmp386 = getelementptr inbounds float, float addrspace(3)* %arg, i32 221
				%tmp387 = load float, float addrspace(3)* %tmp386, align 4
				%tmp388 = getelementptr inbounds float, float addrspace(3)* %arg, i32 222
				%tmp389 = load float, float addrspace(3)* %tmp388, align 4
				%tmp390 = getelementptr inbounds float, float addrspace(3)* %arg, i32 223
				%tmp391 = load float, float addrspace(3)* %tmp390, align 4
				%tmp392 = tail call float @llvm.fmuladd.f32(float %tmp387, float %tmp389, float %tmp391)
				%tmp393 = getelementptr inbounds float, float addrspace(3)* %arg, i32 225
				%tmp394 = load float, float addrspace(3)* %tmp393, align 4
				%tmp395 = getelementptr inbounds float, float addrspace(3)* %arg, i32 226
				%tmp396 = load float, float addrspace(3)* %tmp395, align 4
				%tmp397 = getelementptr inbounds float, float addrspace(3)* %arg, i32 227
				%tmp398 = load float, float addrspace(3)* %tmp397, align 4
				%tmp399 = tail call float @llvm.fmuladd.f32(float %tmp394, float %tmp396, float %tmp398)
				%tmp400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 229
				%tmp401 = load float, float addrspace(3)* %tmp400, align 4
				%tmp402 = getelementptr inbounds float, float addrspace(3)* %arg, i32 230
				%tmp403 = load float, float addrspace(3)* %tmp402, align 4
				%tmp404 = getelementptr inbounds float, float addrspace(3)* %arg, i32 231
				%tmp405 = load float, float addrspace(3)* %tmp404, align 4
				%tmp406 = tail call float @llvm.fmuladd.f32(float %tmp401, float %tmp403, float %tmp405)
				%tmp407 = getelementptr inbounds float, float addrspace(3)* %arg, i32 233
				%tmp408 = load float, float addrspace(3)* %tmp407, align 4
				%tmp409 = getelementptr inbounds float, float addrspace(3)* %arg, i32 234
				%tmp410 = load float, float addrspace(3)* %tmp409, align 4
				%tmp411 = getelementptr inbounds float, float addrspace(3)* %arg, i32 235
				%tmp412 = load float, float addrspace(3)* %tmp411, align 4
				%tmp413 = tail call float @llvm.fmuladd.f32(float %tmp408, float %tmp410, float %tmp412)
				%tmp414 = getelementptr inbounds float, float addrspace(3)* %arg, i32 237
				%tmp415 = load float, float addrspace(3)* %tmp414, align 4
				%tmp416 = getelementptr inbounds float, float addrspace(3)* %arg, i32 238
				%tmp417 = load float, float addrspace(3)* %tmp416, align 4
				%tmp418 = getelementptr inbounds float, float addrspace(3)* %arg, i32 239
				%tmp419 = load float, float addrspace(3)* %tmp418, align 4
				%tmp420 = tail call float @llvm.fmuladd.f32(float %tmp415, float %tmp417, float %tmp419)
				%tmp421 = getelementptr inbounds float, float addrspace(3)* %arg, i32 241
				%tmp422 = load float, float addrspace(3)* %tmp421, align 4
				%tmp423 = getelementptr inbounds float, float addrspace(3)* %arg, i32 242
				%tmp424 = load float, float addrspace(3)* %tmp423, align 4
				%tmp425 = getelementptr inbounds float, float addrspace(3)* %arg, i32 243
				%tmp426 = load float, float addrspace(3)* %tmp425, align 4
				%tmp427 = tail call float @llvm.fmuladd.f32(float %tmp422, float %tmp424, float %tmp426)
				%tmp428 = getelementptr inbounds float, float addrspace(3)* %arg, i32 245
				%tmp429 = load float, float addrspace(3)* %tmp428, align 4
				%tmp430 = getelementptr inbounds float, float addrspace(3)* %arg, i32 246
				%tmp431 = load float, float addrspace(3)* %tmp430, align 4
				%tmp432 = getelementptr inbounds float, float addrspace(3)* %arg, i32 247
				%tmp433 = load float, float addrspace(3)* %tmp432, align 4
				%tmp434 = tail call float @llvm.fmuladd.f32(float %tmp429, float %tmp431, float %tmp433)
				%tmp435 = getelementptr inbounds float, float addrspace(3)* %arg, i32 249
				%tmp436 = load float, float addrspace(3)* %tmp435, align 4
				%tmp437 = getelementptr inbounds float, float addrspace(3)* %arg, i32 250
				%tmp438 = load float, float addrspace(3)* %tmp437, align 4
				%tmp439 = getelementptr inbounds float, float addrspace(3)* %arg, i32 251
				%tmp440 = load float, float addrspace(3)* %tmp439, align 4
				%tmp441 = tail call float @llvm.fmuladd.f32(float %tmp436, float %tmp438, float %tmp440)
				%tmp442 = getelementptr inbounds float, float addrspace(3)* %arg, i32 253
				%tmp443 = load float, float addrspace(3)* %tmp442, align 4
				%tmp444 = getelementptr inbounds float, float addrspace(3)* %arg, i32 254
				%tmp445 = load float, float addrspace(3)* %tmp444, align 4
				%tmp446 = getelementptr inbounds float, float addrspace(3)* %arg, i32 255
				%tmp447 = load float, float addrspace(3)* %tmp446, align 4
				%tmp448 = tail call float @llvm.fmuladd.f32(float %tmp443, float %tmp445, float %tmp447)
				store float %tmp7, float addrspace(1)* %arg1, align 4
				%tmp449 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 1
				store float %tmp14, float addrspace(1)* %tmp449, align 4
				%tmp450 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 2
				store float %tmp21, float addrspace(1)* %tmp450, align 4
				%tmp451 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 3
				store float %tmp28, float addrspace(1)* %tmp451, align 4
				%tmp452 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 4
				store float %tmp35, float addrspace(1)* %tmp452, align 4
				%tmp453 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 5
				store float %tmp42, float addrspace(1)* %tmp453, align 4
				%tmp454 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 6
				store float %tmp49, float addrspace(1)* %tmp454, align 4
				%tmp455 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 7
				store float %tmp56, float addrspace(1)* %tmp455, align 4
				%tmp456 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 8
				store float %tmp63, float addrspace(1)* %tmp456, align 4
				%tmp457 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 9
				store float %tmp70, float addrspace(1)* %tmp457, align 4
				%tmp458 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 10
				store float %tmp77, float addrspace(1)* %tmp458, align 4
				%tmp459 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 11
				store float %tmp84, float addrspace(1)* %tmp459, align 4
				%tmp460 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 12
				store float %tmp91, float addrspace(1)* %tmp460, align 4
				%tmp461 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 13
				store float %tmp98, float addrspace(1)* %tmp461, align 4
				%tmp462 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 14
				store float %tmp105, float addrspace(1)* %tmp462, align 4
				%tmp463 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 15
				store float %tmp112, float addrspace(1)* %tmp463, align 4
				%tmp464 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 16
				store float %tmp119, float addrspace(1)* %tmp464, align 4
				%tmp465 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 17
				store float %tmp126, float addrspace(1)* %tmp465, align 4
				%tmp466 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 18
				store float %tmp133, float addrspace(1)* %tmp466, align 4
				%tmp467 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 19
				store float %tmp140, float addrspace(1)* %tmp467, align 4
				%tmp468 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 20
				store float %tmp147, float addrspace(1)* %tmp468, align 4
				%tmp469 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 21
				store float %tmp154, float addrspace(1)* %tmp469, align 4
				%tmp470 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 22
				store float %tmp161, float addrspace(1)* %tmp470, align 4
				%tmp471 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 23
				store float %tmp168, float addrspace(1)* %tmp471, align 4
				%tmp472 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 24
				store float %tmp175, float addrspace(1)* %tmp472, align 4
				%tmp473 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 25
				store float %tmp182, float addrspace(1)* %tmp473, align 4
				%tmp474 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 26
				store float %tmp189, float addrspace(1)* %tmp474, align 4
				%tmp475 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 27
				store float %tmp196, float addrspace(1)* %tmp475, align 4
				%tmp476 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 28
				store float %tmp203, float addrspace(1)* %tmp476, align 4
				%tmp477 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 29
				store float %tmp210, float addrspace(1)* %tmp477, align 4
				%tmp478 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 30
				store float %tmp217, float addrspace(1)* %tmp478, align 4
				%tmp479 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 31
				store float %tmp224, float addrspace(1)* %tmp479, align 4
				%tmp480 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 32
				store float %tmp231, float addrspace(1)* %tmp480, align 4
				%tmp481 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 33
				store float %tmp238, float addrspace(1)* %tmp481, align 4
				%tmp482 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 34
				store float %tmp245, float addrspace(1)* %tmp482, align 4
				%tmp483 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 35
				store float %tmp252, float addrspace(1)* %tmp483, align 4
				%tmp484 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 36
				store float %tmp259, float addrspace(1)* %tmp484, align 4
				%tmp485 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 37
				store float %tmp266, float addrspace(1)* %tmp485, align 4
				%tmp486 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 38
				store float %tmp273, float addrspace(1)* %tmp486, align 4
				%tmp487 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 39
				store float %tmp280, float addrspace(1)* %tmp487, align 4
				%tmp488 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 40
				store float %tmp287, float addrspace(1)* %tmp488, align 4
				%tmp489 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 41
				store float %tmp294, float addrspace(1)* %tmp489, align 4
				%tmp490 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 42
				store float %tmp301, float addrspace(1)* %tmp490, align 4
				%tmp491 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 43
				store float %tmp308, float addrspace(1)* %tmp491, align 4
				%tmp492 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 44
				store float %tmp315, float addrspace(1)* %tmp492, align 4
				%tmp493 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 45
				store float %tmp322, float addrspace(1)* %tmp493, align 4
				%tmp494 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 46
				store float %tmp329, float addrspace(1)* %tmp494, align 4
				%tmp495 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 47
				store float %tmp336, float addrspace(1)* %tmp495, align 4
				%tmp496 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 48
				store float %tmp343, float addrspace(1)* %tmp496, align 4
				%tmp497 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 49
				store float %tmp350, float addrspace(1)* %tmp497, align 4
				%tmp498 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 50
				store float %tmp357, float addrspace(1)* %tmp498, align 4
				%tmp499 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 51
				store float %tmp364, float addrspace(1)* %tmp499, align 4
				%tmp500 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 52
				store float %tmp371, float addrspace(1)* %tmp500, align 4
				%tmp501 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 53
				store float %tmp378, float addrspace(1)* %tmp501, align 4
				%tmp502 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 54
				store float %tmp385, float addrspace(1)* %tmp502, align 4
				%tmp503 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 55
				store float %tmp392, float addrspace(1)* %tmp503, align 4
				%tmp504 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 56
				store float %tmp399, float addrspace(1)* %tmp504, align 4
				%tmp505 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 57
				store float %tmp406, float addrspace(1)* %tmp505, align 4
				%tmp506 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 58
				store float %tmp413, float addrspace(1)* %tmp506, align 4
				%tmp507 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 59
				store float %tmp420, float addrspace(1)* %tmp507, align 4
				%tmp508 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 60
				store float %tmp427, float addrspace(1)* %tmp508, align 4
				%tmp509 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 61
				store float %tmp434, float addrspace(1)* %tmp509, align 4
				%tmp510 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 62
				store float %tmp441, float addrspace(1)* %tmp510, align 4
				%tmp511 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 63
				store float %tmp448, float addrspace(1)* %tmp511, align 4
				ret void
				}

				; Declaration of the llvm.fmuladd.f32 intrinsic: takes three float
				; operands and returns a float. It carries attribute group #1
				; (nounwind readnone), matching the summary line below.
				; NOTE(review): presumably called by the kernel whose body ends just
				; above; the call sites are outside this excerpt -- confirm.
				; Function Attrs: nounwind readnone
				declare float @llvm.fmuladd.f32(float, float, float) #1

				; Attribute group #0: nounwind plus the target string attribute
				; "amdgpu-waves-per-eu" set to "1,1".
				; NOTE(review): "1,1" presumably requests min = max = 1 wave per
				; execution unit, and #0 is presumably attached to the test kernel
				; defined above this excerpt -- confirm against the define line and
				; the AMDGPU usage documentation.
				attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
				; Attribute group #1: nounwind readnone; referenced by the
				; llvm.fmuladd.f32 declaration.
				attributes #1 = { nounwind readnone }