Index: llvm/trunk/lib/CodeGen/LiveIntervalAnalysis.cpp =================================================================== --- llvm/trunk/lib/CodeGen/LiveIntervalAnalysis.cpp +++ llvm/trunk/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -506,20 +506,19 @@ // Visit all instructions reading Reg. SlotIndex LastIdx; - for (MachineOperand &MO : MRI->reg_operands(Reg)) { - MachineInstr *UseMI = MO.getParent(); - if (UseMI->isDebugValue() || !MO.readsReg()) + for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) { + // Skip "undef" uses. + if (!MO.readsReg()) continue; // Maybe the operand is for a subregister we don't care about. unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubReg); - if (MO.isDef()) - LaneMask = ~LaneMask & MRI->getMaxLaneMaskForVReg(Reg); if ((LaneMask & SR.LaneMask) == 0) continue; } // We only need to visit each instruction once. + MachineInstr *UseMI = MO.getParent(); SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot(); if (Idx == LastIdx) continue; Index: llvm/trunk/lib/CodeGen/LiveRangeCalc.h =================================================================== --- llvm/trunk/lib/CodeGen/LiveRangeCalc.h +++ llvm/trunk/lib/CodeGen/LiveRangeCalc.h @@ -160,6 +160,7 @@ /// all uses must be jointly dominated by the definitions from @p LR /// together with definitions of other lanes where @p LR becomes undefined /// (via operands). + /// If @p LR is a main range, the @p LaneMask should be set to ~0. void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask, LiveInterval *LI = nullptr); Index: llvm/trunk/lib/CodeGen/LiveRangeCalc.cpp =================================================================== --- llvm/trunk/lib/CodeGen/LiveRangeCalc.cpp +++ llvm/trunk/lib/CodeGen/LiveRangeCalc.cpp @@ -163,13 +163,18 @@ LI->computeSubRangeUndefs(Undefs, Mask, *MRI, *Indexes); // Visit all operands that read Reg. This may include partial defs. 
+ bool IsSubRange = (Mask != ~0U); const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo(); for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) { // Clear all kill flags. They will be reinserted after register allocation // by LiveIntervalAnalysis::addKillFlags(). if (MO.isUse()) MO.setIsKill(false); - if (!MO.readsReg()) + // MO::readsReg returns "true" for subregister defs. This is for keeping + // liveness of the entire register (i.e. for the main range of the live + // interval). For subranges, definitions of non-overlapping subregisters + // do not count as uses. + if (!MO.readsReg() || (IsSubRange && MO.isDef())) continue; unsigned SubReg = MO.getSubReg(); Index: llvm/trunk/lib/CodeGen/SplitKit.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SplitKit.cpp +++ llvm/trunk/lib/CodeGen/SplitKit.cpp @@ -1206,7 +1206,8 @@ // defining the register. This is because a <def,read-undef> operand // will create an "undef" point, and we cannot extend any subranges // until all of them have been accounted for. - ExtPoints.push_back(ExtPoint(MO, RegIdx, Next)); + if (MO.isUse()) + ExtPoints.push_back(ExtPoint(MO, RegIdx, Next)); } else { LiveRangeCalc &LRC = getLRCalc(RegIdx); LRC.extend(LI, Next, 0, ArrayRef()); @@ -1221,10 +1222,6 @@ unsigned Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg(); LaneBitmask LM = Sub != 0 ? TRI.getSubRegIndexLaneMask(Sub) : MRI.getMaxLaneMaskForVReg(Reg); - // If this is a non-read-undef definition of a sub-register, extend - // subranges for everything except that sub-register. 
- if (Sub != 0 && EP.MO.isDef()) - LM = MRI.getMaxLaneMaskForVReg(Reg) & ~LM; for (LiveInterval::SubRange &S : LI.subranges()) { if (!(S.LaneMask & LM)) continue; Index: llvm/trunk/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll +++ llvm/trunk/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll @@ -0,0 +1,62 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s +; REQUIRES: asserts +; +; This testcase used to cause the following crash: +; +; *** Couldn't join subrange! +; +; UNREACHABLE executed at lib/CodeGen/RegisterCoalescer.cpp:2666! +; +; The insertelement instructions became subregister definitions: one virtual +; register was defined and re-defined by one group of the consecutive insert- +; elements, and another was defined by the second group. +; Since a copy between the two full registers was present in the program, +; the coalescer tried to merge them. The join algorithm for the main range +; decided that it was correct to do so, while the subrange join unexpectedly +; failed. This was caused by the live interval subranges not being computed +; correctly: subregister defs are not uses for the purpose of subranges. 
+; +; Test for a valid output: +; CHECK: image_sample_c_d_o + +target triple = "amdgcn--" + +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 { +main_body: + %tmp = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg6, <2 x i32> %arg8) + %tmp23 = fadd float %tmp, 0xBFA99999A0000000 + %tmp24 = fadd float %tmp, 0x3FA99999A0000000 + %tmp25 = bitcast float %tmp23 to i32 + %tmp26 = insertelement <16 x i32> , i32 %tmp25, i32 1 + %tmp27 = insertelement <16 x i32> %tmp26, i32 undef, i32 2 + %tmp28 = insertelement <16 x i32> %tmp27, i32 undef, i32 3 + %tmp29 = insertelement <16 x i32> %tmp28, i32 undef, i32 4 + %tmp30 = insertelement <16 x i32> %tmp29, i32 0, i32 5 + %tmp31 = insertelement <16 x i32> %tmp30, i32 undef, i32 6 + %tmp32 = insertelement <16 x i32> %tmp31, i32 undef, i32 7 + %tmp33 = insertelement <16 x i32> %tmp32, i32 undef, i32 8 + %tmp34 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp33, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %tmp35 = extractelement <4 x float> %tmp34, i32 0 + %tmp36 = bitcast float %tmp24 to i32 + %tmp37 = insertelement <16 x i32> , i32 %tmp36, i32 1 + 
%tmp38 = insertelement <16 x i32> %tmp37, i32 undef, i32 2 + %tmp39 = insertelement <16 x i32> %tmp38, i32 undef, i32 3 + %tmp40 = insertelement <16 x i32> %tmp39, i32 undef, i32 4 + %tmp41 = insertelement <16 x i32> %tmp40, i32 0, i32 5 + %tmp42 = insertelement <16 x i32> %tmp41, i32 undef, i32 6 + %tmp43 = insertelement <16 x i32> %tmp42, i32 undef, i32 7 + %tmp44 = insertelement <16 x i32> %tmp43, i32 undef, i32 8 + %tmp45 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp44, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %tmp46 = extractelement <4 x float> %tmp45, i32 0 + %tmp47 = fmul float %tmp35, %tmp46 + %tmp48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp47, 14 + %tmp49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp48, float %arg21, 24 + ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp49 +} + +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" } +attributes #1 = { nounwind readnone } Index: llvm/trunk/test/CodeGen/AMDGPU/coalescer-subreg-join.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/coalescer-subreg-join.mir +++ llvm/trunk/test/CodeGen/AMDGPU/coalescer-subreg-join.mir @@ -0,0 +1,75 @@ +# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s +# Check 
that %11 and %20 have been coalesced. +# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG:[0-9]+]] +# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG]] + +--- +name: main +alignment: 0 +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_256 } + - { id: 4, class: sreg_128 } + - { id: 5, class: sreg_256 } + - { id: 6, class: sreg_128 } + - { id: 7, class: sreg_512 } + - { id: 9, class: vreg_512 } + - { id: 11, class: vreg_512 } + - { id: 18, class: vgpr_32 } + - { id: 20, class: vreg_512 } + - { id: 27, class: vgpr_32 } +liveins: + - { reg: '%sgpr2_sgpr3', virtual-reg: '%0' } + - { reg: '%vgpr2', virtual-reg: '%1' } + - { reg: '%vgpr3', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0: + liveins: %sgpr2_sgpr3, %vgpr2, %vgpr3 + + %0 = COPY %sgpr2_sgpr3 + %1 = COPY %vgpr2 + %2 = COPY %vgpr3 + %3 = S_LOAD_DWORDX8_IMM %0, 0 + %4 = S_LOAD_DWORDX4_IMM %0, 12 + %5 = S_LOAD_DWORDX8_IMM %0, 16 + %6 = S_LOAD_DWORDX4_IMM %0, 28 + undef %7.sub0 = S_MOV_B32 212739 + %20 = COPY %7 + %11 = COPY %20 + %11.sub1 = COPY %1 + %11.sub2 = COPY %1 + %11.sub3 = COPY %1 + %11.sub4 = COPY %1 + %11.sub5 = COPY %1 + %11.sub6 = COPY %1 + %11.sub7 = COPY %1 + %11.sub8 = COPY %1 + dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec + %20.sub1 = COPY %2 + %20.sub2 = COPY %2 + %20.sub3 = COPY %2 + %20.sub4 = COPY %2 + %20.sub5 = COPY %2 + %20.sub6 = COPY %2 + %20.sub7 = COPY %2 + %20.sub8 = COPY %2 + dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec + +... 
Index: llvm/trunk/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ llvm/trunk/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s +; REQUIRES: asserts +; +; This test used to crash with the following assertion: +; llc: include/llvm/ADT/IntervalMap.h:632: unsigned int llvm::IntervalMapImpl::LeafNode<llvm::SlotIndex, llvm::LiveInterval *, 8, llvm::IntervalMapInfo<llvm::SlotIndex> >::insertFrom(unsigned int &, unsigned int, KeyT, KeyT, ValT) [KeyT = llvm::SlotIndex, ValT = llvm::LiveInterval *, N = 8, Traits = llvm::IntervalMapInfo<llvm::SlotIndex>]: Assertion `(i == Size || Traits::stopLess(b, start(i))) && "Overlapping insert"' failed. +; +; This was related to incorrectly calculating subregister live ranges +; (i.e. live interval subranges): subregister defs are not uses for that +; purpose. +; +; Check for a valid output: +; CHECK: tbuffer_store_format_x + +target triple = "amdgcn--" + +define amdgpu_gs void @main(i32 inreg %arg) #0 { +main_body: + %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 20) + %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 24) + %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 48) + %array_vector3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 3 + %array_vector5 = insertelement <4 x float> , float %tmp, i32 1 + %array_vector6 = insertelement <4 x float> %array_vector5, float undef, i32 2 + %array_vector9 = insertelement <4 x float> , float %tmp1, i32 1 + %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 + %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp3, i32 1, i32 36, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + 
%bc = bitcast <4 x float> %array_vector3 to <4 x i32> + %tmp4 = extractelement <4 x i32> %bc, i32 undef + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp4, i32 1, i32 48, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32> + %tmp5 = extractelement <4 x i32> %bc49, i32 undef + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp5, i32 1, i32 72, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %array_vector21 = insertelement <4 x float> , float %tmp, i32 1 + %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2 + %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 28, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> + %tmp6 = extractelement <4 x i32> %bc52, i32 undef + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp6, i32 1, i32 64, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 20, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 56, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 92, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + ret void +} + +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #2 +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3 + +attributes #0 = { nounwind "target-cpu"="tonga" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind 
readonly } +attributes #3 = { nounwind } Index: llvm/trunk/test/CodeGen/AMDGPU/unigine-liveness-crash.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/unigine-liveness-crash.ll +++ llvm/trunk/test/CodeGen/AMDGPU/unigine-liveness-crash.ll @@ -0,0 +1,115 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s +; REQUIRES: asserts +; +; This test used to crash with the following assertion: +; llc: include/llvm/ADT/IntervalMap.h:632: unsigned int llvm::IntervalMapImpl::LeafNode<llvm::SlotIndex, llvm::LiveInterval *, 8, llvm::IntervalMapInfo<llvm::SlotIndex> >::insertFrom(unsigned int &, unsigned int, KeyT, KeyT, ValT) [KeyT = llvm::SlotIndex, ValT = llvm::LiveInterval *, N = 8, Traits = llvm::IntervalMapInfo<llvm::SlotIndex>]: Assertion `(i == Size || Traits::stopLess(b, start(i))) && "Overlapping insert"' failed. +; +; This was related to incorrectly calculating subregister live ranges +; (i.e. live interval subranges): subregister defs are not uses for that +; purpose. +; +; Check for a valid output. +; CHECK: image_sample_c + +target triple = "amdgcn--" + +@ddxy_lds = external addrspace(3) global [64 x i32] + +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 { +main_body: + %tmp = call float 
@llvm.SI.fs.interp(i32 3, i32 4, i32 %arg6, <2 x i32> %arg8) + %tmp23 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp24 = extractelement <4 x float> %tmp23, i32 3 + %tmp25 = fmul float %tmp24, undef + %tmp26 = fmul float undef, %tmp + %tmp27 = fadd float %tmp26, undef + %tmp28 = bitcast float %tmp27 to i32 + %tmp29 = insertelement <4 x i32> undef, i32 %tmp28, i32 0 + %tmp30 = insertelement <4 x i32> %tmp29, i32 0, i32 1 + %tmp31 = insertelement <4 x i32> %tmp30, i32 undef, i32 2 + %tmp32 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp31, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp33 = extractelement <4 x float> %tmp32, i32 0 + %tmp34 = fadd float undef, %tmp33 + %tmp35 = fadd float %tmp34, undef + %tmp36 = fadd float %tmp35, undef + %tmp37 = fadd float %tmp36, undef + %tmp38 = fadd float %tmp37, undef + %tmp39 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp40 = extractelement <4 x float> %tmp39, i32 0 + %tmp41 = extractelement <4 x float> %tmp39, i32 1 + %tmp42 = extractelement <4 x float> %tmp39, i32 2 + %tmp43 = extractelement <4 x float> %tmp39, i32 3 + %tmp44 = fmul float %tmp40, undef + %tmp45 = fmul float %tmp41, undef + %tmp46 = fmul float %tmp42, undef + %tmp47 = fmul float %tmp43, undef + %tmp48 = fadd float undef, %tmp44 + %tmp49 = fadd float undef, %tmp45 + %tmp50 = bitcast float %tmp27 to i32 + %tmp51 = bitcast float %tmp48 to i32 + %tmp52 = bitcast float %tmp49 to i32 + %tmp53 = insertelement <4 x i32> undef, i32 %tmp50, i32 0 + %tmp54 = insertelement <4 x i32> %tmp53, i32 %tmp51, i32 1 + %tmp55 = insertelement <4 x i32> %tmp54, i32 %tmp52, i32 2 + %tmp56 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp55, <8 x i32> undef, <4 x i32> undef, i32 
15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp57 = extractelement <4 x float> %tmp56, i32 0 + %tmp58 = fadd float %tmp38, %tmp57 + %tmp59 = fadd float undef, %tmp46 + %tmp60 = fadd float undef, %tmp47 + %tmp61 = bitcast float %tmp59 to i32 + %tmp62 = bitcast float %tmp60 to i32 + %tmp63 = insertelement <4 x i32> undef, i32 %tmp61, i32 1 + %tmp64 = insertelement <4 x i32> %tmp63, i32 %tmp62, i32 2 + %tmp65 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp64, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp66 = extractelement <4 x float> %tmp65, i32 0 + %tmp67 = fadd float %tmp58, %tmp66 + %tmp68 = fmul float %tmp67, 1.250000e-01 + %tmp69 = fmul float %tmp68, undef + %tmp70 = fcmp une float %tmp69, 0.000000e+00 + br i1 %tmp70, label %IF26, label %ENDIF25 + +IF26: ; preds = %main_body + %tmp71 = bitcast float %tmp27 to i32 + %tmp72 = insertelement <4 x i32> undef, i32 %tmp71, i32 0 + br label %LOOP + +ENDIF25: ; preds = %IF29, %main_body + %.4 = phi float [ %tmp84, %IF29 ], [ %tmp68, %main_body ] + %tmp73 = fadd float %.4, undef + %tmp74 = call float @llvm.AMDGPU.clamp.(float %tmp73, float 0.000000e+00, float 1.000000e+00) + %tmp75 = fmul float undef, %tmp74 + %tmp76 = fmul float %tmp75, undef + %tmp77 = fadd float %tmp76, undef + %tmp78 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp77, 11 + %tmp79 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp78, float undef, 12 + %tmp80 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp79, float undef, 13 + %tmp81 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp80, float %tmp25, 14 + %tmp82 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp81, float undef, 15 + %tmp83 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp82, float %arg21, 24 + ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp83 + +LOOP: ; preds = %ENDIF28, %IF26 + %.5 = phi float [ undef, %IF26 ], [ %tmp89, %ENDIF28 ] + br i1 false, label %IF29, label %ENDIF28 + +IF29: ; preds = %LOOP + %tmp84 = fmul float %.5, 3.125000e-02 + br label %ENDIF25 + +ENDIF28: ; preds = %LOOP + %tmp85 = insertelement <4 x i32> %tmp72, i32 undef, i32 1 + %tmp86 = insertelement <4 x i32> %tmp85, i32 undef, i32 2 + %tmp87 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp86, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp88 = extractelement <4 x float> %tmp87, i32 0 + %tmp89 = fadd float undef, %tmp88 + br label %LOOP +} + +declare float @llvm.AMDGPU.clamp.(float, float, float) #1 +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" } +attributes #1 = { nounwind readnone }