diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -66,8 +66,11 @@
   const SITargetLowering *TLI;
   AMDGPUTTIImpl CommonTTI;
   bool IsGraphics;
+  bool IsPixelShader;
   bool HasFP32Denormals;
   bool HasFP64FP16Denormals;
+  bool UseAggressiveLoadSinking;
+  bool CanAlwaysSinkLoads;
 
   static const FeatureBitset InlineFeatureIgnoreList;
 
@@ -219,6 +222,8 @@
   InstructionCost getMinMaxReductionCost(
       VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
       TTI::TargetCostKind CostKind);
+
+  bool canAlwaysSinkRead(const Instruction *Inst, const BasicBlock *Dest) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -14,6 +14,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPUInstrInfo.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -288,10 +289,16 @@
     : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
       TLI(ST->getTargetLowering()), CommonTTI(TM, F),
-      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
+      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
+      IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS) {
   AMDGPU::SIModeRegisterDefaults Mode(F);
   HasFP32Denormals = Mode.allFP32Denormals();
   HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
+
+  UseAggressiveLoadSinking =
+      F.getFnAttribute("amdgpu-aggressive-load-sinking").getValueAsBool();
+  CanAlwaysSinkLoads =
+      F.getFnAttribute("amdgpu-always-sink-loads").getValueAsBool();
 }
 
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
@@ -1209,3 +1216,27 @@
   Cost.first += (Size + 255) / 256;
   return Cost;
 }
+
+bool GCNTTIImpl::canAlwaysSinkRead(const Instruction *Inst,
+                                   const BasicBlock *Dest) const {
+  if (!UseAggressiveLoadSinking)
+    return false;
+
+  // An earlier pass may have proven there are no stores or exec-mask writes.
+  if (CanAlwaysSinkLoads)
+    return true;
+
+  bool Result = Inst->hasMetadata(LLVMContext::MD_invariant_load);
+  if (Result && IsPixelShader) {
+    // Instructions using implicit derivatives in WQM cannot be safely sunk,
+    // as kills/demotes may alter the exec mask.
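+    // Plain invariant loads remain sinkable; image intrinsics are rejected
+    // below if their MIMG base opcode executes in whole quad mode (WQM).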
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+      if (auto *Intr = AMDGPU::getImageDimIntrinsicInfo(II->getIntrinsicID())) {
+        const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+            AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+        Result = !BaseOpcode->WQM;
+      }
+    }
+  }
+  return Result;
+}
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -48,6 +48,7 @@
   bit IsAtomicRet = 0;
   bit MSAA = 0;
   bit BVH = 0;
+  bit WQM = 0;
 }
 
 def MIMGBaseOpcode : GenericEnum {
@@ -59,7 +60,7 @@
   let CppTypeName = "MIMGBaseOpcodeInfo";
   let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
                 "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
-                "LodOrClampOrMip", "HasD16", "MSAA", "BVH"];
+                "LodOrClampOrMip", "HasD16", "MSAA", "BVH", "WQM"];
   string TypeOf_BaseOpcode = "MIMGBaseOpcode";
 
   let PrimaryKey = ["BaseOpcode"];
@@ -1086,6 +1087,7 @@
   def "" : MIMG_Sampler_BaseOpcode<sample> {
     let HasD16 = !not(isGetLod);
     let G16 = isG16;
+    let WQM = wqm;
   }
 
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -1111,6 +1113,7 @@
   def "" : MIMG_Sampler_BaseOpcode<sample> {
     let HasD16 = 1;
     let Gather4 = 1;
+    let WQM = wqm;
   }
 
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -313,6 +313,7 @@
   bool HasD16;
   bool MSAA;
   bool BVH;
+  bool WQM;
 };
 
 LLVM_READONLY
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-load-sinking.ll b/llvm/test/CodeGen/AMDGPU/aggressive-load-sinking.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/aggressive-load-sinking.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1010 -sink < %s | FileCheck --check-prefix=CHECK %s
+
+; Can sink invariant load
+define amdgpu_ps <4 x float> @invariant_load(<4 x float> addrspace(4)* %in, i32 %s) #0 {
+; CHECK-LABEL: @invariant_load(
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[S:%.*]], 0
+; CHECK-NEXT:    br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]]
+; CHECK:       block:
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[S]], 1
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[V:%.*]] = load <4 x float>, <4 x float> addrspace(4)* [[IN:%.*]], align 4, !invariant.load !0
+; CHECK-NEXT:    ret <4 x float> [[V]]
+;
+main_body:
+  %v = load <4 x float>, <4 x float> addrspace(4)* %in, align 4, !invariant.load !0
+  %c = icmp eq i32 %s, 0
+  br i1 %c, label %block, label %end
+block:
+  %z = add i32 %s, 1
+  br label %end
+end:
+  ret <4 x float> %v
+}
+
+; Cannot sink load not marked as invariant
+define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) #0 {
+; CHECK-LABEL: @load_1d(
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[S]], 0
+; CHECK-NEXT:    br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]]
+; CHECK:       block:
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[S]], 1
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret <4 x float> [[V]]
+;
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %c = icmp eq i32 %s, 0
+  br i1 %c, label %block, label %end
+block:
+  %z = add i32 %s, 1
+ br label %end +end: + ret <4 x float> %v +} + +; Can sink invariant load +define amdgpu_ps <4 x float> @load_1d_invariant(<8 x i32> inreg %rsrc, i32 %s) #0 { +; CHECK-LABEL: @load_1d_invariant( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[S:%.*]], 0 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = add i32 [[S]], 1 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0), !invariant.load !0 + %c = icmp eq i32 %s, 0 + br i1 %c, label %block, label %end +block: + %z = add i32 %s, 1 + br label %end +end: + ret <4 x float> %v +} + +; Can sink any load with attribute set +define amdgpu_ps <4 x float> @load_1d_forced(<8 x i32> inreg %rsrc, i32 %s) #1 { +; CHECK-LABEL: @load_1d_forced( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[S:%.*]], 0 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = add i32 [[S]], 1 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %c = icmp eq i32 %s, 0 + br i1 %c, label %block, label %end +block: + %z = add i32 %s, 1 + br label %end +end: + ret <4 x float> %v +} + +; Cannot sink sample with implicit derivatives +define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) #0 { +; CHECK-LABEL: @sample_2d( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +; Can sink sample explicit derivatives +define amdgpu_ps <4 x float> @sample_2d_d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) #0 { +; CHECK-LABEL: @sample_2d_d( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S]], float [[T:%.*]], <8 x i32> 
[[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +; Can sink sample with explicit LOD +define amdgpu_ps <4 x float> @sample_2d_l(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) #0 { +; CHECK-LABEL: @sample_2d_l( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float [[S]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +; Cannot sink gather4 with implicit derivatives +define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) #0 { +; CHECK-LABEL: @gather4_2d( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +; Can sink gather4 with explicit LOD +define amdgpu_ps <4 x float> @gather4_2d_l_o(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) #0 { +; CHECK-LABEL: @gather4_2d_l_o( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float 
%t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 + +attributes #0 = { nounwind "amdgpu-aggressive-load-sinking"="true" } +attributes #1 = { nounwind "amdgpu-aggressive-load-sinking"="true" "amdgpu-always-sink-loads"="true" } +attributes #2 = { nounwind readonly } + +!0 = !{}