Index: lib/CodeGen/LiveIntervalAnalysis.cpp
===================================================================
--- lib/CodeGen/LiveIntervalAnalysis.cpp
+++ lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -506,7 +506,7 @@
 
   // Visit all instructions reading Reg.
   SlotIndex LastIdx;
-  for (MachineOperand &MO : MRI->reg_operands(Reg)) {
+  for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
     MachineInstr *UseMI = MO.getParent();
-    if (UseMI->isDebugValue() || !MO.readsReg())
+    if (!MO.readsReg())
       continue;
@@ -514,8 +514,6 @@
     unsigned SubReg = MO.getSubReg();
     if (SubReg != 0) {
       LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubReg);
-      if (MO.isDef())
-        LaneMask = ~LaneMask & MRI->getMaxLaneMaskForVReg(Reg);
       if ((LaneMask & SR.LaneMask) == 0)
         continue;
     }
Index: lib/CodeGen/LiveRangeCalc.h
===================================================================
--- lib/CodeGen/LiveRangeCalc.h
+++ lib/CodeGen/LiveRangeCalc.h
@@ -160,6 +160,7 @@
   /// all uses must be jointly dominated by the definitions from @p LR
   /// together with definitions of other lanes where @p LR becomes undefined
   /// (via <def,read-undef> operands).
+  /// If @p LR is a main range, @p LaneMask must be set to ~0 (all lanes set).
   void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask,
                     LiveInterval *LI = nullptr);
 
Index: lib/CodeGen/LiveRangeCalc.cpp
===================================================================
--- lib/CodeGen/LiveRangeCalc.cpp
+++ lib/CodeGen/LiveRangeCalc.cpp
@@ -163,13 +163,17 @@
     LI->computeSubRangeUndefs(Undefs, Mask, *MRI, *Indexes);
 
   // Visit all operands that read Reg. This may include partial defs.
+  bool IsSubRange = (Mask != ~0U);
   const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
   for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
     // Clear all kill flags. They will be reinserted after register allocation
     // by LiveIntervalAnalysis::addKillFlags().
     if (MO.isUse())
       MO.setIsKill(false);
-    if (!MO.readsReg())
+    // A partial def counts as a read of Reg, which is only relevant to the
+    // main range (Mask == ~0). When extending a subrange, no def operand is
+    // treated as a use.
+    if (!MO.readsReg() || (IsSubRange && MO.isDef()))
       continue;
 
     unsigned SubReg = MO.getSubReg();
Index: lib/CodeGen/SplitKit.cpp
===================================================================
--- lib/CodeGen/SplitKit.cpp
+++ lib/CodeGen/SplitKit.cpp
@@ -1206,7 +1206,8 @@
       // defining the register. This is because a <def,read-undef> operand
       // will create an "undef" point, and we cannot extend any subranges
       // until all of them have been accounted for.
-      ExtPoints.push_back(ExtPoint(MO, RegIdx, Next));
+      if (MO.isUse())
+        ExtPoints.push_back(ExtPoint(MO, RegIdx, Next));
     } else {
       LiveRangeCalc &LRC = getLRCalc(RegIdx);
       LRC.extend(LI, Next, 0, ArrayRef<SlotIndex>());
@@ -1221,10 +1222,6 @@
     unsigned Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg();
     LaneBitmask LM = Sub != 0 ? TRI.getSubRegIndexLaneMask(Sub)
                               : MRI.getMaxLaneMaskForVReg(Reg);
-    // If this is a non-read-undef definition of a sub-register, extend
-    // subranges for everything except that sub-register.
-    if (Sub != 0 && EP.MO.isDef())
-      LM = MRI.getMaxLaneMaskForVReg(Reg) & ~LM;
     for (LiveInterval::SubRange &S : LI.subranges()) {
       if (!(S.LaneMask & LM))
         continue;
Index: test/CodeGen/AMDGPU/coalescer-subreg-join.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/coalescer-subreg-join.mir
@@ -0,0 +1,75 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
+# Check that %11 and %20 have been coalesced.
+# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG:[0-9]+]]
+# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG]]
+
+---
+name:            main
+alignment:       0
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_256 }
+  - { id: 4, class: sreg_128 }
+  - { id: 5, class: sreg_256 }
+  - { id: 6, class: sreg_128 }
+  - { id: 7, class: sreg_512 }
+  - { id: 9, class: vreg_512 }
+  - { id: 11, class: vreg_512 }
+  - { id: 18, class: vgpr_32 }
+  - { id: 20, class: vreg_512 }
+  - { id: 27, class: vgpr_32 }
+liveins:
+  - { reg: '%sgpr2_sgpr3', virtual-reg: '%0' }
+  - { reg: '%vgpr2', virtual-reg: '%1' }
+  - { reg: '%vgpr3', virtual-reg: '%2' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0:
+    liveins: %sgpr2_sgpr3, %vgpr2, %vgpr3
+
+    %0 = COPY %sgpr2_sgpr3
+    %1 = COPY %vgpr2
+    %2 = COPY %vgpr3
+    %3 = S_LOAD_DWORDX8_IMM %0, 0
+    %4 = S_LOAD_DWORDX4_IMM %0, 12
+    %5 = S_LOAD_DWORDX8_IMM %0, 16
+    %6 = S_LOAD_DWORDX4_IMM %0, 28
+    undef %7.sub0 = S_MOV_B32 212739
+    %20 = COPY %7
+    %11 = COPY %20
+    %11.sub1 = COPY %1
+    %11.sub2 = COPY %1
+    %11.sub3 = COPY %1
+    %11.sub4 = COPY %1
+    %11.sub5 = COPY %1
+    %11.sub6 = COPY %1
+    %11.sub7 = COPY %1
+    %11.sub8 = COPY %1
+    dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec
+    %20.sub1 = COPY %2
+    %20.sub2 = COPY %2
+    %20.sub3 = COPY %2
+    %20.sub4 = COPY %2
+    %20.sub5 = COPY %2
+    %20.sub6 = COPY %2
+    %20.sub7 = COPY %2
+    %20.sub8 = COPY %2
+    dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec
+
+...
Index: test/CodeGen/AMDGPU/scheduler-liveness-1.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scheduler-liveness-1.ll
@@ -0,0 +1,85 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; Check for a sane output. This testcase used to crash.
+; CHECK: image_sample_c
+
+target triple = "amdgcn--"
+
+define amdgpu_ps void @main() #0 {
+main_body:
+  %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
+  %1 = load <4 x i32>, <4 x i32> addrspace(2)* null, align 16, !invariant.load !0
+  %2 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> zeroinitializer, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %3 = extractelement <4 x float> %2, i32 0
+  %4 = fmul float %3, 2.000000e+00
+  %5 = fadd float %4, -1.000000e+00
+  %6 = fmul float %5, undef
+  %7 = fadd float %6, undef
+  %8 = call float @llvm.AMDGPU.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
+  %9 = fmul float %8, 0.000000e+00
+  %10 = fadd float undef, %9
+  %11 = fmul float %0, undef
+  %12 = fadd float %11, undef
+  %13 = fadd float undef, undef
+  %14 = bitcast float %12 to i32
+  %15 = bitcast float %13 to i32
+  %16 = insertelement <4 x i32> undef, i32 %14, i32 0
+  %17 = insertelement <4 x i32> %16, i32 %15, i32 1
+  %18 = insertelement <4 x i32> %17, i32 undef, i32 2
+  %19 = bitcast float %12 to i32
+  %20 = insertelement <4 x i32> undef, i32 %19, i32 0
+  %21 = insertelement <4 x i32> %20, i32 0, i32 1
+  %22 = insertelement <4 x i32> %21, i32 0, i32 2
+  %23 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %22, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %24 = extractelement <4 x float> %23, i32 0
+  %25 = fadd float undef, %24
+  %26 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> zeroinitializer, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %27 = extractelement <4 x float> %26, i32 0
+  %28 = fadd float %25, %27
+  %29 = fadd float %28, undef
+  %30 = fadd float undef, undef
+  %31 = bitcast float %12 to i32
+  %32 = bitcast float %30 to i32
+  %33 = insertelement <4 x i32> undef, i32 %31, i32 0
+  %34 = insertelement <4 x i32> %33, i32 %32, i32 1
+  %35 = insertelement <4 x i32> %34, i32 undef, i32 2
+  %36 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %35, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %37 = extractelement <4 x float> %36, i32 0
+  %38 = fadd float %29, %37
+  %39 = fmul float %38, 1.250000e-01
+  %40 = fmul float %39, 8.000000e+00
+  br label %LOOP
+
+LOOP:                                             ; preds = %LOOP, %main_body
+  %.038 = phi float [ 0x36C0000000000000, %main_body ], [ 0.000000e+00, %LOOP ]
+  %.5 = phi float [ %40, %main_body ], [ undef, %LOOP ]
+  %41 = bitcast float %.038 to i32
+  %42 = icmp sgt i32 %41, 15
+  br i1 %42, label %IF29, label %LOOP
+
+IF29:                                             ; preds = %LOOP
+  %43 = fmul float %.5, 3.125000e-02
+  %44 = fadd float %43, undef
+  %45 = call float @llvm.AMDGPU.clamp.(float %44, float 0.000000e+00, float 1.000000e+00)
+  %46 = fmul float %10, %45
+  %47 = fmul float %46, undef
+  %48 = fadd float %47, undef
+  %49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %48, 11
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.AMDGPU.clamp.(float, float, float) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{}
Index: test/CodeGen/AMDGPU/scheduler-liveness-2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scheduler-liveness-2.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; Check for a sane output. This testcase used to crash.
+; CHECK: tbuffer_store_format_x
+
+target triple = "amdgcn--"
+
+define amdgpu_gs void @main(i32 inreg) {
+main_body:
+  %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 36)
+  %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 40)
+  %array_vector21 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float undef, i32 1
+  %array_vector22 = insertelement <4 x float> %array_vector21, float %1, i32 2
+  %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3
+  %array_vector25 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float undef, i32 1
+  %array_vector26 = insertelement <4 x float> %array_vector25, float %2, i32 2
+  %array_vector27 = insertelement <4 x float> %array_vector26, float 0.000000e+00, i32 3
+  %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32>
+  %3 = extractelement <4 x i32> %bc52, i32 undef
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %3, i32 1, i32 64, i32 %0, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc53 = bitcast <4 x float> %array_vector27 to <4 x i32>
+  %4 = extractelement <4 x i32> %bc53, i32 undef
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %4, i32 1, i32 76, i32 %0, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/unigine-liveness-crash.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/unigine-liveness-crash.ll
@@ -0,0 +1,140 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+; Check for a sane output. This testcase used to crash.
+; CHECK: image_sample_c
+
+target triple = "amdgcn--"
+
+@ddxy_lds = external addrspace(3) global [64 x i32]
+
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
+main_body:
+  %23 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %6, <2 x i32> %8)
+  %24 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %25 = extractelement <4 x float> %24, i32 3
+  %26 = fmul float %25, undef
+  %27 = fmul float undef, %23
+  %28 = fadd float %27, undef
+  %29 = bitcast float %28 to i32
+  %30 = insertelement <4 x i32> undef, i32 %29, i32 0
+  %31 = insertelement <4 x i32> %30, i32 0, i32 1
+  %32 = insertelement <4 x i32> %31, i32 undef, i32 2
+  %33 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %34 = extractelement <4 x float> %33, i32 0
+  %35 = fadd float undef, %34
+  %36 = fadd float %35, undef
+  %37 = fadd float %36, undef
+  %38 = fadd float %37, undef
+  %39 = fadd float %38, undef
+  %40 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %41 = extractelement <4 x float> %40, i32 0
+  %42 = extractelement <4 x float> %40, i32 1
+  %43 = extractelement <4 x float> %40, i32 2
+  %44 = extractelement <4 x float> %40, i32 3
+  %45 = fmul float %41, undef
+  %46 = fmul float %42, undef
+  %47 = fmul float %43, undef
+  %48 = fmul float %44, undef
+  %49 = fadd float undef, %45
+  %50 = fadd float undef, %46
+  %51 = bitcast float %28 to i32
+  %52 = bitcast float %49 to i32
+  %53 = bitcast float %50 to i32
+  %54 = insertelement <4 x i32> undef, i32 %51, i32 0
+  %55 = insertelement <4 x i32> %54, i32 %52, i32 1
+  %56 = insertelement <4 x i32> %55, i32 %53, i32 2
+  %57 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %56, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %58 = extractelement <4 x float> %57, i32 0
+  %59 = fadd float %39, %58
+  %60 = fadd float undef, %47
+  %61 = fadd float undef, %48
+  %62 = bitcast float %60 to i32
+  %63 = bitcast float %61 to i32
+  %64 = insertelement <4 x i32> undef, i32 %62, i32 1
+  %65 = insertelement <4 x i32> %64, i32 %63, i32 2
+  %66 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %65, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %67 = extractelement <4 x float> %66, i32 0
+  %68 = fadd float %59, %67
+  %69 = fmul float %68, 1.250000e-01
+  %70 = fmul float %69, undef
+  %71 = fcmp une float %70, 0.000000e+00
+  br i1 %71, label %IF26, label %ENDIF25
+
+IF26:                                             ; preds = %main_body
+  %72 = bitcast float %28 to i32
+  %73 = insertelement <4 x i32> undef, i32 %72, i32 0
+  br label %LOOP
+
+ENDIF25:                                          ; preds = %IF29, %main_body
+  %.4 = phi float [ %85, %IF29 ], [ %69, %main_body ]
+  %74 = fadd float %.4, undef
+  %75 = call float @llvm.AMDGPU.clamp.(float %74, float 0.000000e+00, float 1.000000e+00)
+  %76 = fmul float undef, %75
+  %77 = fmul float %76, undef
+  %78 = fadd float %77, undef
+  %79 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %78, 11
+  %80 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %79, float undef, 12
+  %81 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %80, float undef, 13
+  %82 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %81, float %26, 14
+  %83 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %82, float undef, 15
+  %84 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %83, float %21, 24
+  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %84
+
+LOOP:                                             ; preds = %ENDIF28, %IF26
+  %.5 = phi float [ undef, %IF26 ], [ %90, %ENDIF28 ]
+  br i1 false, label %IF29, label %ENDIF28
+
+IF29:                                             ; preds = %LOOP
+  %85 = fmul float %.5, 3.125000e-02
+  br label %ENDIF25
+
+ENDIF28:                                          ; preds = %LOOP
+  %86 = insertelement <4 x i32> %73, i32 undef, i32 1
+  %87 = insertelement <4 x i32> %86, i32 undef, i32 2
+  %88 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %87, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %89 = extractelement <4 x float> %88, i32 0
+  %90 = fadd float undef, %89
+  br label %LOOP
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+
+; Function Attrs: convergent nounwind readnone
+declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.AMDGPU.clamp.(float, float, float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.pow.f32(float, float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.fabs.f32(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.maxnum.f32(float, float) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { convergent nounwind readnone }