
Commit ce095c5

Author: Farhana Aleen
Committed: Dec 14, 2018
[AMDGPU] Promote constant offset to the immediate by finding a new base with 13bit constant offset from the nearby instructions.
Summary: Promote constant offset to immediate by recomputing the relative 13bit
offset from nearby instructions.

E.g.
  s_movk_i32 s0, 0x1800
  v_add_co_u32_e32 v0, vcc, s0, v2
  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc

  s_movk_i32 s0, 0x1000
  v_add_co_u32_e32 v5, vcc, s0, v2
  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc

  global_load_dwordx2 v[5:6], v[5:6], off
  global_load_dwordx2 v[0:1], v[0:1], off

=>

  s_movk_i32 s0, 0x1000
  v_add_co_u32_e32 v5, vcc, s0, v2
  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc

  global_load_dwordx2 v[5:6], v[5:6], off
  global_load_dwordx2 v[0:1], v[5:6], off offset:2048

Author: FarhanaAleen

Reviewed By: arsenm, rampitec

Subscribers: llvm-commits, AMDGPU

Differential Revision: https://reviews.llvm.org/D55539

llvm-svn: 349196
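A minimal sketch of the arithmetic the summary describes, assuming (as the summary and the tests below indicate) that the legal window is the signed 13-bit range [-4096, 4095]; the helper name is illustrative and not part of the patch:

    #include <cstdint>

    // Illustrative helper (not from the patch): true if Off fits the signed
    // 13-bit immediate window [-4096, 4095] referred to in the summary.
    static bool fitsSigned13(int64_t Off) {
      return Off >= -4096 && Off <= 4095;
    }

    int main() {
      // In the summary's example the two bases differ by 0x1800 - 0x1000 = 0x800.
      int64_t Dist = 0x1800 - 0x1000; // 2048 bytes
      // 2048 is inside the window, so the second load can reuse the 0x1000 base
      // and encode the difference as "offset:2048" instead of materializing 0x1800.
      return fitsSigned13(Dist) ? 0 : 1;
    }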
Parent: 2618750

File tree: 4 files changed, +1001 -1 lines
 

‎llvm/lib/Target/AMDGPU/SIISelLowering.h

(+1 -1)

@@ -170,7 +170,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
-  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
   unsigned isCFIntrinsic(const SDNode *Intr) const;
@@ -212,6 +211,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                           SmallVectorImpl<Value*> &/*Ops*/,
                           Type *&/*AccessTy*/) const override;
 
+  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS,
                              Instruction *I = nullptr) const override;

‎llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

(+361)

@@ -20,6 +20,26 @@
 // ==>
 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
 //
+// This pass also tries to promote constant offset to the immediate by
+// adjusting the base. It tries to use a base from the nearby instructions that
+// allows it to have a 13bit constant offset and then promotes the 13bit offset
+// to the immediate.
+// E.g.
+//  s_movk_i32 s0, 0x1800
+//  v_add_co_u32_e32 v0, vcc, s0, v2
+//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+//
+//  s_movk_i32 s0, 0x1000
+//  v_add_co_u32_e32 v5, vcc, s0, v2
+//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//  global_load_dwordx2 v[5:6], v[5:6], off
+//  global_load_dwordx2 v[0:1], v[0:1], off
+// =>
+//  s_movk_i32 s0, 0x1000
+//  v_add_co_u32_e32 v5, vcc, s0, v2
+//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//  global_load_dwordx2 v[5:6], v[5:6], off
+//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
 //
 // Future improvements:
 //
@@ -116,6 +136,21 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     SmallVector<MachineInstr *, 8> InstsToMove;
   };
 
+  struct BaseRegisters {
+    unsigned LoReg = 0;
+    unsigned HiReg = 0;
+
+    unsigned LoSubReg = 0;
+    unsigned HiSubReg = 0;
+  };
+
+  struct MemAddress {
+    BaseRegisters Base;
+    int64_t Offset = 0;
+  };
+
+  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
+
 private:
   const GCNSubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
@@ -146,6 +181,19 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
 
+  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
+                           int32_t NewOffset);
+  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
+  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
+  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
+  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
+  /// Promotes constant offset to the immediate by adjusting the base. It
+  /// tries to use a base from the nearby instructions that allows it to have
+  /// a 13bit constant offset which gets promoted to the immediate.
+  bool promoteConstantOffsetToImm(MachineInstr &CI,
+                                  MemInfoMap &Visited,
+                                  SmallPtrSet<MachineInstr *, 4> &Promoted);
+
 public:
   static char ID;
 
@@ -1053,15 +1101,328 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
   return Next;
 }
 
+MachineOperand
+SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
+  APInt V(32, Val, true);
+  if (TII->isInlineConstant(V))
+    return MachineOperand::CreateImm(Val);
+
+  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  MachineInstr *Mov =
+    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+            TII->get(AMDGPU::S_MOV_B32), Reg)
+      .addImm(Val);
+  LLVM_DEBUG(dbgs() << " "; Mov->dump());
+  return MachineOperand::CreateReg(Reg, false);
+}
+
+// Compute base address using Addr and return the final register.
+unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
+                                           const MemAddress &Addr) {
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
+          Addr.Base.LoSubReg) &&
+         "Expected 32-bit Base-Register-Low!!");
+
+  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
+          Addr.Base.HiSubReg) &&
+         "Expected 32-bit Base-Register-Hi!!");
+
+  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
+  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
+  MachineOperand OffsetHi =
+    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
+  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  unsigned DeadCarryReg =
+    MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *LoHalf =
+    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
+      .addReg(CarryReg, RegState::Define)
+      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
+      .add(OffsetLo);
+  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
+
+  MachineInstr *HiHalf =
+    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
+      .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+      .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
+      .add(OffsetHi)
+      .addReg(CarryReg, RegState::Kill);
+  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
+
+  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  MachineInstr *FullBase =
+    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
+
+  return FullDestReg;
+}
+
+// Update base and offset with the NewBase and NewOffset in MI.
+void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
+                                               unsigned NewBase,
+                                               int32_t NewOffset) {
+  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
+  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
+}
+
+Optional<int32_t>
+SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
+  if (Op.isImm())
+    return Op.getImm();
+
+  if (!Op.isReg())
+    return None;
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
+      !Def->getOperand(1).isImm())
+    return None;
+
+  return Def->getOperand(1).getImm();
+}
+
+// Analyze Base and extracts:
+//  - 32bit base registers, subregisters
+//  - 64bit constant offset
+// Expecting base computation as:
+//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
+//   %LO:vgpr_32, %c:sreg_64_xexec =
+//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
+//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
+//   %Base:vreg_64 =
+//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
+void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
+                                                      MemAddress &Addr) {
+  if (!Base.isReg())
+    return;
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
+  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
+      || Def->getNumOperands() != 5)
+    return;
+
+  MachineOperand BaseLo = Def->getOperand(1);
+  MachineOperand BaseHi = Def->getOperand(3);
+  if (!BaseLo.isReg() || !BaseHi.isReg())
+    return;
+
+  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
+  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
+
+  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
+      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
+    return;
+
+  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
+  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
+
+  auto Offset0P = extractConstOffset(*Src0);
+  if (Offset0P)
+    BaseLo = *Src1;
+  else {
+    if (!(Offset0P = extractConstOffset(*Src1)))
+      return;
+    BaseLo = *Src0;
+  }
+
+  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
+  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
+
+  if (Src0->isImm())
+    std::swap(Src0, Src1);
+
+  if (!Src1->isImm())
+    return;
+
+  assert(isInt<32>(*Offset0P) && isInt<32>(Src1->getImm())
+         && "Expected 32bit immediate!!!");
+  uint64_t Offset1 = Src1->getImm();
+  BaseHi = *Src0;
+
+  Addr.Base.LoReg = BaseLo.getReg();
+  Addr.Base.HiReg = BaseHi.getReg();
+  Addr.Base.LoSubReg = BaseLo.getSubReg();
+  Addr.Base.HiSubReg = BaseHi.getSubReg();
+  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
+}
+
+bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
+    MachineInstr &MI,
+    MemInfoMap &Visited,
+    SmallPtrSet<MachineInstr *, 4> &AnchorList) {
+
+  // TODO: Support flat and scratch.
+  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
+      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
+    return false;
+
+  // TODO: Support Store.
+  if (!MI.mayLoad())
+    return false;
+
+  if (AnchorList.count(&MI))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
+
+  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
+    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
+    return false;
+  }
+
+  // Step1: Find the base-registers and a 64bit constant offset.
+  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  MemAddress MAddr;
+  if (Visited.find(&MI) == Visited.end()) {
+    processBaseWithConstOffset(Base, MAddr);
+    Visited[&MI] = MAddr;
+  } else
+    MAddr = Visited[&MI];
+
+  if (MAddr.Offset == 0) {
+    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
+                         " constant offsets that can be promoted.\n";);
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
+             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
+
+  // Step2: Traverse through MI's basic block and find an anchor(that has the
+  // same base-registers) with the highest 13bit distance from MI's offset.
+  // E.g. (64bit loads)
+  // bb:
+  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
+  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
+  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
+  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
+  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
+  //
+  // Starting from the first load, the optimization will try to find a new base
+  // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
+  // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
+  // as the new-base(anchor) because of the maximum distance which can
+  // accomodate more intermediate bases presumeably.
+  //
+  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
+  // (&a + 8192) for load1, load2, load4.
+  //   addr = &a + 8192
+  //   load1 = load(addr, -4096)
+  //   load2 = load(addr, -2048)
+  //   load3 = load(addr, 0)
+  //   load4 = load(addr, 2048)
+  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
+  //
+  MachineInstr *AnchorInst = nullptr;
+  MemAddress AnchorAddr;
+  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
+  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
+
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator E = MBB->end();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  ++MBBI;
+  const SITargetLowering *TLI =
+    static_cast<const SITargetLowering *>(STM->getTargetLowering());
+
+  for ( ; MBBI != E; ++MBBI) {
+    MachineInstr &MINext = *MBBI;
+    // TODO: Support finding an anchor(with same base) from store addresses or
+    // any other load addresses where the opcodes are different.
+    if (MINext.getOpcode() != MI.getOpcode() ||
+        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
+      continue;
+
+    const MachineOperand &BaseNext =
+      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
+    MemAddress MAddrNext;
+    if (Visited.find(&MINext) == Visited.end()) {
+      processBaseWithConstOffset(BaseNext, MAddrNext);
+      Visited[&MINext] = MAddrNext;
+    } else
+      MAddrNext = Visited[&MINext];
+
+    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
+        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
+        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
+        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
+      continue;
+
+    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
+
+    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
+    TargetLoweringBase::AddrMode AM;
+    AM.HasBaseReg = true;
+    AM.BaseOffs = Dist;
+    if (TLI->isLegalGlobalAddressingMode(AM) &&
+        (uint32_t)abs(Dist) > MaxDist) {
+      MaxDist = abs(Dist);
+
+      AnchorAddr = MAddrNext;
+      AnchorInst = &MINext;
+    }
+  }
+
+  if (AnchorInst) {
+    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
+               AnchorInst->dump());
+    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
+               << AnchorAddr.Offset << "\n\n");
+
+    // Instead of moving up, just re-compute anchor-instruction's base address.
+    unsigned Base = computeBase(MI, AnchorAddr);
+
+    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
+    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
+
+    for (auto P : InstsWCommonBase) {
+      TargetLoweringBase::AddrMode AM;
+      AM.HasBaseReg = true;
+      AM.BaseOffs = P.second - AnchorAddr.Offset;
+
+      if (TLI->isLegalGlobalAddressingMode(AM)) {
+        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
+                   dbgs() << ")"; P.first->dump());
+        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
+        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
+      }
+    }
+    AnchorList.insert(AnchorInst);
+    return true;
+  }
+
+  return false;
+}
+
 // Scan through looking for adjacent LDS operations with constant offsets from
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
   bool Modified = false;
 
+  // Contain the list
+  MemInfoMap Visited;
+  // Contains the list of instructions for which constant offsets are being
+  // promoted to the IMM.
+  SmallPtrSet<MachineInstr *, 4> AnchorList;
+
   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
     MachineInstr &MI = *I;
 
+    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
+      Modified = true;
+
     // Don't combine if volatile.
     if (MI.hasOrderedMemoryRef()) {
       ++I;
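To make the Step2/Step3 heuristic above easier to follow outside of the MachineInstr plumbing, here is a small self-contained sketch of the same idea: among candidate addresses that share a base, pick as the anchor the one farthest from the current address that is still reachable by a legal immediate, then rewrite the other addresses relative to that anchor. The names and the hard-coded [-4096, 4095] legality check are illustrative simplifications of what the pass actually queries through TLI->isLegalGlobalAddressingMode:

    #include <cstdint>
    #include <cstdlib>
    #include <vector>

    // Illustrative stand-in for the signed 13-bit immediate legality check.
    static bool isLegalImmOffset(int64_t Off) {
      return Off >= -4096 && Off <= 4095;
    }

    // Given the offset of the load being optimized and the offsets of other
    // loads that use the same {LoReg, HiReg} base, return the index of the
    // anchor: the candidate with the largest |distance| whose distance is
    // still legal. Returns -1 if none qualifies (mirrors "if (AnchorInst)").
    static int pickAnchor(int64_t MyOffset, const std::vector<int64_t> &Others) {
      int Anchor = -1;
      int64_t MaxDist = 0;
      for (int I = 0, E = (int)Others.size(); I != E; ++I) {
        int64_t Dist = MyOffset - Others[I];
        if (isLegalImmOffset(Dist) && std::llabs(Dist) > MaxDist) {
          MaxDist = std::llabs(Dist);
          Anchor = I;
        }
      }
      return Anchor;
    }

    int main() {
      // The Step2 example above: the first load sits at &a + 4096 and the other
      // loads with the same base sit at &a + {6144, 8192, 10240, 12288}.
      std::vector<int64_t> Others = {6144, 8192, 10240, 12288};
      int Anchor = pickAnchor(4096, Others);
      // 8192 wins: its distance of -4096 is still legal and largest in
      // magnitude, so &a + 8192 becomes the re-computed base. 4096 -> offset
      // -4096, 6144 -> -2048, 10240 -> +2048, while 12288 (distance +4096,
      // out of range) keeps its own base, exactly as in the Step3 comment.
      return (Anchor >= 0 && Others[Anchor] == 8192) ? 0 : 1;
    }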
New test file (LLVM IR), 485 lines added:

@@ -0,0 +1,485 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s

declare i64 @_Z13get_global_idj(i32)

define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}

define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}

; using 32bit address.
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Address32:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %id = shl i64 %call, 7
  %idx.ext11 = and i64 %id, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

  %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
  %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
  %add.1 = add i32 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
  %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
  %add.2 = add i32 %load3, %add.1

  %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
  %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
  %add.3 = add i32 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
  %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
  %add.4 = add i32 %load5, %add.3

  %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
  %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
  %add.5 = add i32 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
  %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
  %add.6 = add i32 %load7, %add.5

  %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
  %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
  %add.7 = add i32 %load8, %add.6

  %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
  %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
  %add.8 = add i32 %load9, %add.7

  %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
  %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
  %add.9 = add i32 %load10, %add.8

  store i32 %add.9, i32 addrspace(1)* %addr, align 4
  ret void
}

define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Offset64:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}

; TODO: Support load4 as anchor instruction.
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}

define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
; GCN-LABEL: DiffBase:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
                                  i8 addrspace(1)* %buffer2) {
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}

define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}

define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}
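A reading aid for the checks above (not part of the test): the getelementptr indices are in units of the loaded element type, so for the i64 tests an index of 256 is a byte offset of 256 * 8 = 2048. Only byte distances inside the signed 13-bit window can be folded into a GFX9 immediate, which is why the GFX9 checks mix plain "off" with offset:2048 / -2048 / -4096. A quick sanity check of that arithmetic:

    int main() {
      // i64 elements are 8 bytes, so the GEP indices above translate to byte
      // offsets that are multiples of 2048; only offsets in [-4096, 4095] can
      // be folded into a global_load immediate (per the commit summary).
      static_assert(256 * 8 == 2048, "index 256 -> 2048 bytes");
      static_assert(512 * 8 == 4096, "index 512 -> 4096 bytes");
      static_assert(768 * 8 == 6144, "index 768 -> 6144 bytes");
      return 0;
    }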
New test file (MIR), 154 lines added:

@@ -0,0 +1,154 @@
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s

# GFX9-LABEL: name: diffoporder_add
# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, -2048, 0, 0
# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, 0, 0, 0

name: diffoporder_add
body: |
  bb.0.entry:
    %0:sgpr_64 = COPY $sgpr0_sgpr1
    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
    %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
    %4:sreg_32_xm0 = COPY $sgpr101
    %5:sreg_32_xm0 = S_MOV_B32 0
    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
    $sgpr4 = COPY %4
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    %6:vreg_64 = COPY $vgpr0_vgpr1
    %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
    %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
    %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
    %12:sgpr_32 = COPY %1.sub1
    %13:vgpr_32 = COPY %5
    %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
    %16:vgpr_32 = COPY %12
    %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
    %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
    %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
    %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
    %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
    %25:sgpr_32 = S_MOV_B32 4096
    %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %25, %21, implicit $exec
    %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec
    %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
    %32:sgpr_32 = S_MOV_B32 6144
    %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec
    %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec
    %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec
...
---

# GFX9-LABEL: name: LowestInMiddle
# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 11200
# GFX9: [[BASE_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]]
# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]]
# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE_LO]], %subreg.sub0, [[BASE_HI]], %subreg.sub1
# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -3200, 0, 0
#
# GFX9: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 6400
# GFX9: [[BASE1_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_7:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_2]]
# GFX9: [[BASE1_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_7]]
# GFX9: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE1_LO]], %subreg.sub0, [[BASE1_HI]], %subreg.sub1
# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE3]], 0, 0, 0,
# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0,

name: LowestInMiddle
body: |
  bb.0.entry:
    %0:sgpr_64 = COPY $sgpr0_sgpr1
    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
    %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
    %4:sreg_32_xm0 = COPY $sgpr101
    %5:sreg_32_xm0 = S_MOV_B32 0
    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
    $sgpr4 = COPY %4
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    %6:vreg_64 = COPY $vgpr0_vgpr1
    %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
    %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
    %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
    %12:sgpr_32 = COPY %1.sub1
    %13:vgpr_32 = COPY %5
    %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
    %16:vgpr_32 = COPY %12
    %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
    %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
    %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
    %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
    %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
    %25:sgpr_32 = S_MOV_B32 8000
    %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec
    %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec
    %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
    %32:sgpr_32 = S_MOV_B32 6400
    %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec
    %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec
    %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec
    %39:sgpr_32 = S_MOV_B32 11200
    %40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec
    %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, implicit $exec
    %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec
...
---

# GFX9-LABEL: name: NegativeDistance
# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 10240
# GFX9: [[V_ADD_I32_e64_4:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]]
# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]]
# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_4]], %subreg.sub0, [[BASE_HI]], %subreg.sub1
# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -4096, 0, 0
# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -2048, 0, 0
# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0

name: NegativeDistance
body: |
  bb.0.entry:
    %0:sgpr_64 = COPY $sgpr0_sgpr1
    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
    %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
    %4:sreg_32_xm0 = COPY $sgpr101
    %5:sreg_32_xm0 = S_MOV_B32 0
    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
    $sgpr4 = COPY %4
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    %6:vreg_64 = COPY $vgpr0_vgpr1
    %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
    %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
    %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
    %12:sgpr_32 = COPY %1.sub1
    %13:vgpr_32 = COPY %5
    %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
    %16:vgpr_32 = COPY %12
    %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
    %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
    %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
    %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
    %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
    %25:sgpr_32 = S_MOV_B32 6144
    %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec
    %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec
    %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
    %32:sgpr_32 = S_MOV_B32 8192
    %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec
    %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec
    %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec
    %39:sgpr_32 = S_MOV_B32 10240
    %40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec
    %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, implicit $exec
    %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec
...
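A brief reader's annotation (not part of the test) on how the LowestInMiddle checks follow from the anchor heuristic: the three loads sit at byte offsets 8000, 6400, and 11200 from the shared base. For the first load the farthest candidate still inside the signed 13-bit window is 11200 (distance -3200), so that address is re-materialized as the new anchor base and the first load folds to an immediate of -3200; the load at 6400 is 4800 below the anchor, out of range, so it keeps its own base, while the anchor's own load folds to offset 0. A sketch of that arithmetic:

    int main() {
      // Byte distances of the three loads from the chosen anchor base (+11200):
      static_assert(8000 - 11200 == -3200, "first load folds to offset -3200");
      static_assert(11200 - 11200 == 0, "the anchor load folds to offset 0");
      static_assert(6400 - 11200 == -4800, "outside [-4096, 4095]: keeps its own base");
      return 0;
    }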
