Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2284,7 +2284,8 @@
         }
         if (bestWidth) {
           EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
-          if (newVT.isRound()) {
+          if (newVT.isRound() &&
+              shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
             EVT PtrType = Lod->getOperand(1).getValueType();
             SDValue Ptr = Lod->getBasePtr();
             if (bestOffset != 0)
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -667,6 +667,16 @@
   EVT OldVT = N->getValueType(0);
   unsigned OldSize = OldVT.getStoreSizeInBits();
 
+  MemSDNode *MN = cast<MemSDNode>(N);
+  unsigned AS = MN->getAddressSpace();
+  // Do not shrink an aligned scalar load to sub-dword.
+  // Scalar engine cannot do sub-dword loads.
+  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
+      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()) &&
+      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
+    return false;
+
   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
   // extloads, so doing one requires using a buffer_load. In cases where we
   // still couldn't use a scalar load, using the wider load shouldn't really
Index: test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
+define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 4
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
+define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 4
+  %and = and i32 %load, 8
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
+define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
+define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %and = and i32 %load, 8
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: const_load_shrink_dword_to_unaligned_byte:
+; GCN: global_load_ushort
+define amdgpu_kernel void @const_load_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 2
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}