Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -323,6 +323,14 @@
     return HasMadMixInsts;
   }
 
+  bool hasSBufferLoadStoreAtomicDwordxN() const {
+    // Only use the "x1" variants on GFX9 or don't use the buffer variants.
+    // For x2 and higher variants, if the accessed region spans 2 VM pages and
+    // the second page is unmapped, the hw hangs.
+    // TODO: There is one future GFX9 chip that doesn't have this bug.
+    return getGeneration() != GFX9;
+  }
+
   bool hasCARRY() const {
     return (getGeneration() >= EVERGREEN);
   }
Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -14,6 +14,12 @@
 // ==>
 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
 //
+// The same is done for certain SMEM opcodes, e.g.:
+//  s_buffer_load_dword s4, s[0:3], 4
+//  s_buffer_load_dword s5, s[0:3], 8
+// ==>
+//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
+//
 //
 // Future improvements:
 //
@@ -76,23 +82,28 @@
     unsigned Offset0;
     unsigned Offset1;
     unsigned BaseOff;
+    bool GLC0;
+    bool GLC1;
     bool UseST64;
+    bool IsSBufferLoadImm;
+    bool IsX2;
     SmallVector<MachineInstr*, 8> InstsToMove;
   };
 
 private:
+  const SISubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   AliasAnalysis *AA = nullptr;
+  unsigned CreatedX2;
 
   static bool offsetsCanBeCombined(CombineInfo &CI);
 
-  bool findMatchingDSInst(CombineInfo &CI);
-
+  bool findMatchingInst(CombineInfo &CI);
   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
-
   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
+  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
 
 public:
   static char ID;
@@ -210,6 +221,14 @@
   CI.UseST64 = false;
   CI.BaseOff = 0;
 
+  // SMEM offsets must be consecutive.
+  if (CI.IsSBufferLoadImm) {
+    unsigned Diff = CI.IsX2 ? 2 : 1;
+    return (EltOffset0 + Diff == EltOffset1 ||
+            EltOffset1 + Diff == EltOffset0) &&
+           CI.GLC0 == CI.GLC1;
+  }
+
   // If the offset in elements doesn't fit in 8-bits, we might be able to use
   // the stride 64 versions.
   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
@@ -247,13 +266,18 @@
   return false;
 }
 
-bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
+bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   MachineBasicBlock::iterator E = MBB->end();
   MachineBasicBlock::iterator MBBI = CI.I;
 
-  int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
-                                           AMDGPU::OpName::addr);
+  unsigned AddrOpName;
+  if (CI.IsSBufferLoadImm)
+    AddrOpName = AMDGPU::OpName::sbase;
+  else
+    AddrOpName = AMDGPU::OpName::addr;
+
+  int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName);
   const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
 
   // We only ever merge operations with the same base address register, so don't
@@ -319,10 +343,18 @@
         AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
       int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                  AMDGPU::OpName::offset);
-      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
-      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
       CI.Paired = MBBI;
 
+      if (CI.IsSBufferLoadImm) {
+        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
+        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
+      } else {
+        CI.Offset0 &= 0xffff;
+        CI.Offset1 &= 0xffff;
+      }
+
       // Check both offsets fit in the reduced range.
       // We also need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
@@ -488,6 +520,51 @@
   return Next;
 }
 
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+  CombineInfo &CI) {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
+                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+
+  const TargetRegisterClass *SuperRC =
+    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
+  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+      .addImm(MergedOffset) // offset
+      .addImm(CI.GLC0)      // glc
+      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+  // Handle descending offsets
+  if (CI.Offset0 > CI.Offset1)
+    std::swap(SubRegIdx0, SubRegIdx1);
+
+  // Copy to the old destination registers.
+  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
+  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
+
+  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+      .add(*Dest0) // Copy to same destination including flags and sub reg.
+      .addReg(DestReg, 0, SubRegIdx0);
+  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+                            .add(*Dest1)
+                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+  moveInstsAfter(Copy1, CI.InstsToMove);
+
+  MachineBasicBlock::iterator Next = std::next(CI.I);
+  CI.I->eraseFromParent();
+  CI.Paired->eraseFromParent();
+  return Next;
+}
+
 // Scan through looking for adjacent LDS operations with constant offsets from
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
@@ -505,10 +582,11 @@
 
     CombineInfo CI;
     CI.I = I;
+    CI.IsSBufferLoadImm = false;
     unsigned Opc = MI.getOpcode();
     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
       CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
-      if (findMatchingDSInst(CI)) {
+      if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeRead2Pair(CI);
       } else {
@@ -516,9 +594,10 @@
       }
 
       continue;
-    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+    }
+    if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
       CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
-      if (findMatchingDSInst(CI)) {
+      if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeWrite2Pair(CI);
       } else {
@@ -527,6 +606,23 @@
       continue;
     }
 
+    if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
+        (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+      // EltSize is in units of the offset encoding.
+      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
+      CI.IsSBufferLoadImm = true;
+      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+      if (findMatchingInst(CI)) {
+        Modified = true;
+        I = mergeSBufferLoadImmPair(CI);
+        if (!CI.IsX2)
+          CreatedX2++;
+      } else {
+        ++I;
+      }
+      continue;
+    }
+
     ++I;
   }
 
@@ -538,11 +634,11 @@
   if (skipFunction(*MF.getFunction()))
     return false;
 
-  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
-  if (!STM.loadStoreOptEnabled())
+  STM = &MF.getSubtarget<SISubtarget>();
+  if (!STM->loadStoreOptEnabled())
     return false;
 
-  TII = STM.getInstrInfo();
+  TII = STM->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
 
@@ -553,9 +649,16 @@
   DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
 
   bool Modified = false;
+  CreatedX2 = 0;
 
   for (MachineBasicBlock &MBB : MF)
     Modified |= optimizeBlock(MBB);
 
+  // Run again to convert x2 to x4.
+  if (CreatedX2 >= 1) {
+    for (MachineBasicBlock &MBB : MF)
+      Modified |= optimizeBlock(MBB);
+  }
+
   return Modified;
 }
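To make the new pairing rule concrete: for the S_BUFFER_LOAD_*_IMM case, CI.EltSize is set to AMDGPU::getSMRDEncodedOffset(*STM, 4), i.e. the encoded form of a 4-byte stride (1 on SI/CI, where SMRD immediates are encoded in dwords, and 4 on VI/GFX9, where they are encoded in bytes). After offsetsCanBeCombined divides the raw offsets by EltSize, two loads are mergeable exactly when their element offsets differ by one (two x1 loads becoming an x2) or by two (two x2 loads becoming an x4) and their glc bits agree. The standalone sketch below mirrors that check outside of LLVM; SmemPair and smemOffsetsCanBeCombined are illustrative stand-ins for the pass's CombineInfo and offsetsCanBeCombined, not LLVM API.

// Standalone illustration of the SMEM pairing rule added to
// SILoadStoreOptimizer::offsetsCanBeCombined for S_BUFFER_LOAD_*_IMM.
// Offsets are assumed to be already divided by the element size of the
// offset encoding (EltSize), exactly as the pass does before this check.
#include <cassert>
#include <cstdio>

struct SmemPair {
  unsigned EltOffset0; // element offset of the first load
  unsigned EltOffset1; // element offset of the second load
  bool GLC0;           // glc bit of the first load
  bool GLC1;           // glc bit of the second load
  bool IsX2;           // true when pairing two dwordx2 loads into a dwordx4
};

// Mirrors the early-return branch added to offsetsCanBeCombined: the loads
// must cover adjacent element ranges and agree on the glc bit.
static bool smemOffsetsCanBeCombined(const SmemPair &CI) {
  unsigned Diff = CI.IsX2 ? 2 : 1;
  return (CI.EltOffset0 + Diff == CI.EltOffset1 ||
          CI.EltOffset1 + Diff == CI.EltOffset0) &&
         CI.GLC0 == CI.GLC1;
}

int main() {
  // VI/GFX9 encode SMEM offsets in bytes, so byte offsets 4 and 8 with a
  // 4-byte EltSize become element offsets 1 and 2: mergeable into an x2.
  assert(smemOffsetsCanBeCombined({1, 2, false, false, /*IsX2=*/false}));
  // Two dwordx2 loads covering elements [1,2] and [3,4] merge into an x4.
  assert(smemOffsetsCanBeCombined({1, 3, false, false, /*IsX2=*/true}));
  // A one-element gap between x1 loads, or mismatched glc bits, blocks it.
  assert(!smemOffsetsCanBeCombined({1, 3, false, false, /*IsX2=*/false}));
  assert(!smemOffsetsCanBeCombined({1, 2, true, false, /*IsX2=*/false}));
  std::puts("all pairing checks behaved as expected");
  return 0;
}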
Index: llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
@@ -1,11 +1,12 @@
-; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
 
 ; SMRD load with an immediate offset.
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
 define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
@@ -17,7 +18,7 @@
 ; SMRD load with the largest possible immediate offset.
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
 define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
@@ -31,7 +32,7 @@
 ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
 define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
@@ -61,7 +62,7 @@
 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
 define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
@@ -72,8 +73,8 @@
 
 ; SMRD load with an offset greater than the largest possible immediate on VI
 ; GCN-LABEL: {{^}}smrd5:
-; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
-; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SIVIGFX9: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
+; SIVIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
 define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
@@ -104,7 +105,7 @@
 ; SMRD load using the load.const.v4i32 intrinsic with an immediate offset
 ; GCN-LABEL: {{^}}smrd_load_const0:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
 define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
@@ -118,7 +119,7 @@
 ; offset.
 ; GCN-LABEL: {{^}}smrd_load_const1:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
 define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
@@ -135,7 +136,7 @@
 ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
@@ -150,7 +151,7 @@
 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
 define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
@@ -162,8 +163,8 @@
 
 ; SMRD load with an offset greater than the largest possible immediate on VI
 ; GCN-LABEL: {{^}}smrd_load_const4:
-; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
-; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SIVIGFX9: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
+; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
 define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
@@ -212,6 +213,31 @@
   ret float %r
 }
 
+; GCN-LABEL: {{^}}smrd_imm_merged:
+; GCN-NEXT: BB#
+; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1
+; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7
+; VI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
+; VI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 {
+main_body:
+  %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)
+  %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 8)
+  %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 12)
+  %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 16)
+  %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 28)
+  %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 32)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
+  ret void
+}
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
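The smrd_imm_merged checks above also show why runOnMachineFunction now reruns optimizeBlock when CreatedX2 is non-zero: one sweep only pairs two equal-width loads at a time, so four adjacent dwords first become two dwordx2 loads and only the second sweep produces the dwordx4 (plus the leftover dwordx2 at byte offset 28, i.e. 0x1c on VI and dword offset 0x7 on SI/CI). The toy model below replays that pairing order on the byte offsets used in the test; the Load struct and sweep() function are illustrative, not code from the patch.

// Toy model of the two optimizeBlock sweeps: each sweep merges pairs of
// equal-width loads whose dword ranges are contiguous, so producing a
// dwordx4 from four scalar loads takes two sweeps.
#include <cstdio>
#include <vector>

struct Load {
  unsigned ByteOffset;
  unsigned Dwords; // 1 = s_buffer_load_dword, 2 = dwordx2, 4 = dwordx4
};

// One sweep over the "block": widen the first load of each contiguous
// equal-width pair, drop the second, and keep scanning forward.
static bool sweep(std::vector<Load> &Loads) {
  bool Changed = false;
  for (size_t I = 0; I + 1 < Loads.size(); ++I) {
    Load &A = Loads[I];
    Load &B = Loads[I + 1];
    bool Adjacent = A.ByteOffset + 4 * A.Dwords == B.ByteOffset;
    if (A.Dwords == B.Dwords && A.Dwords < 4 && Adjacent) {
      A.Dwords *= 2;                            // widen the first load
      Loads.erase(Loads.begin() + (long)I + 1); // drop the second load
      Changed = true;
    }
  }
  return Changed;
}

int main() {
  // Byte offsets from the smrd_imm_merged test: 4, 8, 12, 16, 28, 32.
  std::vector<Load> Loads = {{4, 1}, {8, 1}, {12, 1}, {16, 1}, {28, 1}, {32, 1}};
  for (int Pass = 1; Pass <= 2; ++Pass) {
    sweep(Loads);
    std::printf("after sweep %d:", Pass);
    for (const Load &L : Loads)
      std::printf("  dwordx%u @ byte %u", L.Dwords, L.ByteOffset);
    std::printf("\n");
  }
  // Expected final shape: dwordx4 @ byte 4 and dwordx2 @ byte 28, matching
  // the s_buffer_load_dwordx4 / s_buffer_load_dwordx2 pattern checked above.
  return 0;
}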