Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
@@ -53,6 +54,7 @@
   DivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
   bool HasUnsafeFPMath = false;
+  AMDGPUAS AMDGPUASI;
 
   /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
   /// binary operation \p V.
@@ -133,6 +135,7 @@
 
   bool visitInstruction(Instruction &I) { return false; }
   bool visitBinaryOperator(BinaryOperator &I);
+  bool visitLoadInst(LoadInst &I);
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
 
@@ -441,6 +444,50 @@
   return Changed;
 }
 
+/// Widen a uniform sub-dword load from the constant address space.
+///
+/// The load is rewritten as an i32 load through a bitcast pointer; the result
+/// is truncated to the original bit width and bitcast back to the original
+/// type, so that the access can be selected as a scalar dword load.
+///
+/// The rewrite is only done for simple (non-volatile, non-atomic) loads that
+/// are narrower than 32 bits and known to be at least 4-byte aligned, which
+/// keeps the widened access inside the aligned dword holding the original
+/// bytes.
+///
+/// \returns true if \p I was replaced and erased.
+bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+  Type *Ty = I.getType();
+  const DataLayout &DL = Mod->getDataLayout();
+  int TySize = DL.getTypeSizeInBits(Ty);
+  // An alignment of 0 on the instruction means "ABI alignment of the type".
+  unsigned Align = I.getAlignment() ?
+                   I.getAlignment() : DL.getABITypeAlignment(Ty);
+
+  // isSimple() rejects atomic loads as well as volatile ones; the widened load
+  // created below is a plain load and would drop either property. The cheap
+  // checks come first so the divergence query only runs for real candidates.
+  if (I.getPointerAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS ||
+      !I.isSimple() || TySize >= 32 || Align < 4 || !DA->isUniform(&I))
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = Builder.getInt32Ty();
+  Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
+  Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
+  Value *WidenLoad = Builder.CreateLoad(BitCast);
+
+  // Recover a value of the original type from the low TySize bits.
+  Type *IntNTy = Builder.getIntNTy(TySize);
+  Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
+  Value *ValOrig = Builder.CreateBitCast(ValTrunc, Ty);
+  I.replaceAllUsesWith(ValOrig);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
   bool Changed = false;
 
Index: test/CodeGen/AMDGPU/unaligned-load-store.ll
===================================================================
--- test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -519,7 +519,7 @@
 }
 
 ; SI-LABEL: {{^}}constant_align4_load_i8:
-; SI: buffer_load_ubyte
+; SI: s_load_dword
 ; SI: buffer_store_byte
 define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 4
@@ -528,7 +528,7 @@
 }
 
 ; SI-LABEL: {{^}}constant_align2_load_i8:
-; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
 ; SI: buffer_store_byte
 define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 2
Index: test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
@@ -0,0 +1,192 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=GCN %s
+
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+; GCN-LABEL: @constant_load_i1(
+; GCN: load i1
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
+  %val = load i1, i1 addrspace(2)* %in
+  store i1 %val, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i1_align2(
+; GCN: load i1
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
+  %val = load i1, i1 addrspace(2)* %in, align 2
+  store i1 %val, i1 addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i1_align4(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
+  %val = load i1, i1 addrspace(2)* %in, align 4
+  store i1 %val, i1 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i8(
+; GCN: load i8
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+  %val = load i8, i8 addrspace(2)* %in
+  store i8 %val, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i8_align2(
+; GCN: load i8
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+  %val = load i8, i8 addrspace(2)* %in, align 2
+  store i8 %val, i8 addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i8align4(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+  %val = load i8, i8 addrspace(2)* %in, align 4
+  store i8 %val, i8 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; GCN-LABEL: @constant_load_v2i8(
+; GCN: load <2 x i8>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2i8_align4(
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: bitcast
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in, align 4
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v3i8(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: bitcast
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v3i8_align4(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: bitcast
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in, align 4
+  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i16(
+; GCN: load i16
+; GCN: sext
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+  %ld = load i16, i16 addrspace(2)* %in
+  %ext = sext i16 %ld to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i16_align4(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: sext
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+  %ld = load i16, i16 addrspace(2)* %in, align 4
+  %ext = sext i16 %ld to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_half(
+; GCN: load half
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_half(half addrspace(1)* %out, half addrspace(2)* %in) #0 {
+  %ld = load half, half addrspace(2)* %in
+  store half %ld, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2half(
+; GCN: load <2 x half>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %in) #0 {
+  %ld = load <2 x half>, <2 x half> addrspace(2)* %in
+  store <2 x half> %ld, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @load_volatile(
+; GCN: load volatile
+; GCN-NEXT: store
+define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
+  %a = load volatile i16, i16 addrspace(2)* %in
+  store i16 %a, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2i8_volatile(
+; GCN: load volatile
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+  %ld = load volatile <2 x i8>, <2 x i8> addrspace(2)* %in
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2i8_addrespace1(
+; GCN: load <2 x i8>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8_addrespace1(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @use_dispatch_ptr(
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: zext
+; GCN-NEXT: store
+define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #0 {
+  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %val = load i8, i8 addrspace(2)* %dispatch.ptr, align 4
+  %ld = zext i8 %val to i32
+  store i32 %ld, i32 addrspace(1)* %ptr
+  ret void
+}
+
+attributes #0 = { nounwind }