Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -75,6 +75,9 @@
   }
 
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+
   bool isSourceOfDivergence(const Value *V) const;
 };
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -156,6 +156,108 @@
   }
 }
 
+int AMDGPUTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Align, unsigned AS) {
+  // TODO: We should not use the default accounting for the scalarization of
+  // illegal vector types when they can be successfully merged into fewer
+  // loads.
+
+  // FIXME: The base implementation should probably account for
+  // allowsMisalignedMemoryAccess, but unaligned accesses are expanded in a
+  // variety of different ways.
+
+  const unsigned SMRDOpCost = 2;
+  const unsigned BufferOpCost = 5;
+
+  if (Align == 0)
+    Align = DL.getABITypeAlignment(Src);
+
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::PRIVATE_ADDRESS:
+  case AMDGPUAS::FLAT_ADDRESS:
+  default: {
+    // TODO: Account for alignment restrictions.
+
+    if (VectorType *VT = dyn_cast<VectorType>(Src)) {
+      unsigned NElts = VT->getNumElements();
+      Type *EltTy = VT->getElementType();
+      unsigned EltSize = DL.getTypeAllocSize(EltTy);
+
+      // v8i32 and v16i32 vectors are legal, but the largest store is 16
+      // bytes, so ignore the default cost derived from whether the type is
+      // legal and assume the vector is split correctly.
+      if (EltSize == 4) {
+        unsigned RoundedNElts = (NElts + 3) / 4;
+        return BufferOpCost * RoundedNElts;
+      }
+    }
+
+    int BaseCost = BaseT::getMemoryOpCost(Opcode, Src, Align, AS);
+    return BufferOpCost * BaseCost;
+  }
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS: {
+    // LDS is pretty fast assuming no bank conflicts.
+    const unsigned DSOpCost = 3;
+
+    // These don't have the larger load/store sizes, so estimate how the
+    // access will be broken up.
+    VectorType *VT = dyn_cast<VectorType>(Src);
+
+    unsigned Size = DL.getTypeAllocSize(Src);
+    // This only has 32-bit and 64-bit loads and stores available even though
+    // larger vector types are legal, so estimate how many operations this
+    // will be split into. Ignore the base vector legalization cost.
+    if (Align == 1)
+      return DSOpCost * Size;
+
+    int BaseCost = BaseT::getMemoryOpCost(Opcode, Src, Align, AS);
+
+    // Somewhat hacky way to test for scalarization.
+    if (BaseCost == 1 && Align == 2)
+      return DSOpCost * Size / 2;
+
+    if (VT) {
+      unsigned NElts = VT->getNumElements();
+      Type *EltTy = VT->getElementType();
+      unsigned EltSize = DL.getTypeAllocSize(EltTy);
+
+      if (EltSize == 4) {
+        unsigned RoundedNElts = (NElts + 1) / 2;
+        return DSOpCost * RoundedNElts;
+      }
+
+      if (EltSize == 8)
+        return DSOpCost * NElts;
+
+      if (EltSize < 4)
+        return BaseCost * DSOpCost;
+    }
+
+    assert(Align >= 4);
+    return BaseCost * DSOpCost;
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    int BaseCost = BaseT::getMemoryOpCost(Opcode, Src, Align, AS);
+
+    // SMRD requires 4-byte alignment, otherwise we must use buffer
+    // instructions.
+
+    // FIXME: We should be able to handle >= 4 byte aligned sub-dword types.
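+    // For example (see test/Analysis/CostModel/AMDGPU/memory-ops.ll), a
+    // naturally aligned uniform i32 load is estimated at SMRDOpCost * 1 = 2,
+    // while the same load with only 2-byte alignment takes the buffer path
+    // below and is estimated at BufferOpCost * 1 = 5.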
+    if (Align < 4 || DL.getTypeAllocSize(Src) < 4)
+      return BufferOpCost * BaseCost;
+
+    // FIXME: Scalarized illegal types not correctly handled.
+
+    // If uniformly accessed, SMRD instructions are faster than buffer/flat
+    // instructions.
+    return SMRDOpCost * BaseCost;
+  }
+  }
+
+  llvm_unreachable("cannot happen");
+}
+
 static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
                                           const IntrinsicInst *I) {
   switch (I->getIntrinsicID()) {
Index: test/Analysis/CostModel/AMDGPU/memory-ops.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/memory-ops.ll
@@ -0,0 +1,459 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+
+; CHECK: 'store_global_i32'
+; CHECK: estimated cost of 5 for {{.*}} store i32
+define void @store_global_i32(i32 addrspace(1)* %out) #0 {
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v2i32'
+; CHECK: estimated cost of 5 for {{.*}} store <2 x i32>
+define void @store_global_v2i32(<2 x i32> addrspace(1)* %out) #0 {
+  store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v3i32'
+; CHECK: estimated cost of 5 for {{.*}} store <3 x i32>
+define void @store_global_v3i32(<3 x i32> addrspace(1)* %out) #0 {
+  store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v4i32'
+; CHECK: estimated cost of 5 for {{.*}} store <4 x i32>
+define void @store_global_v4i32(<4 x i32> addrspace(1)* %out) #0 {
+  store <4 x i32> zeroinitializer, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v8i32'
+; CHECK: estimated cost of 10 for {{.*}} store <8 x i32>
+define void @store_global_v8i32(<8 x i32> addrspace(1)* %out) #0 {
+  store <8 x i32> zeroinitializer, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v16i32'
+; CHECK: estimated cost of 20 for {{.*}} store <16 x i32>
+define void @store_global_v16i32(<16 x i32> addrspace(1)* %out) #0 {
+  store <16 x i32> zeroinitializer, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v32i32'
+; CHECK: estimated cost of 40 for {{.*}} store <32 x i32>
+define void @store_global_v32i32(<32 x i32> addrspace(1)* %out) #0 {
+  store <32 x i32> zeroinitializer, <32 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v64i32'
+; CHECK: estimated cost of 80 for {{.*}} store <64 x i32>
+define void @store_global_v64i32(<64 x i32> addrspace(1)* %out) #0 {
+  store <64 x i32> zeroinitializer, <64 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_i8'
+; CHECK: estimated cost of 5 for {{.*}} store i8
+define void @store_global_i8(i8 addrspace(1)* %out) #0 {
+  store i8 0, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v4i8'
+; CHECK: estimated cost of 20 for {{.*}} store <4 x i8>
+define void @store_global_v4i8(<4 x i8> addrspace(1)* %out) #0 {
+  store <4 x i8> zeroinitializer, <4 x i8> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_i16'
+; CHECK: estimated cost of 5 for {{.*}} store i16
+define void @store_global_i16(i16 addrspace(1)* %out) #0 {
+  store i16 0, i16 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v4i16'
+; CHECK: estimated cost of 20 for {{.*}} store <4 x i16>
+define void @store_global_v4i16(<4 x i16> addrspace(1)* %out) #0 {
+  store <4 x i16> zeroinitializer, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'store_global_v8i16'
+; CHECK: estimated cost of 40 for {{.*}} store <8 x i16>
+define void @store_global_v8i16(<8 x i16> addrspace(1)* %out) #0 { + store <8 x i16> zeroinitializer, <8 x i16> addrspace(1)* %out + ret void +} + +; CHECK: 'store_private_i32' +; CHECK: estimated cost of 5 for {{.*}} store i32 +define void @store_private_i32(i32* %out) #0 { + store i32 0, i32* %out + ret void +} + +; CHECK: 'store_private_v2i32' +; CHECK: estimated cost of 5 for {{.*}} store <2 x i32> +define void @store_private_v2i32(<2 x i32>* %out) #0 { + store <2 x i32> zeroinitializer, <2 x i32>* %out + ret void +} + +; CHECK: 'store_private_v3i32' +; CHECK: estimated cost of 5 for {{.*}} store <3 x i32> +define void @store_private_v3i32(<3 x i32>* %out) #0 { + store <3 x i32> zeroinitializer, <3 x i32>* %out + ret void +} + +; CHECK: 'store_private_v4i32' +; CHECK: estimated cost of 5 for {{.*}} store <4 x i32> +define void @store_private_v4i32(<4 x i32>* %out) #0 { + store <4 x i32> zeroinitializer, <4 x i32>* %out + ret void +} + +; CHECK: 'store_private_v4i8' +; CHECK: estimated cost of 20 for {{.*}} store <4 x i8> +define void @store_private_v4i8(<4 x i8>* %out) #0 { + store <4 x i8> zeroinitializer, <4 x i8>* %out + ret void +} + +; CHECK: 'store_global_i64' +; CHECK: estimated cost of 5 for {{.*}} store i64 +define void @store_global_i64(i64 addrspace(1)* %out) #0 { + store i64 0, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'store_global_i64_align_1' +; CHECK: estimated cost of 5 for {{.*}} store i64 +define void @store_global_i64_align_1(i64 addrspace(1)* %out) #0 { + store i64 0, i64 addrspace(1)* %out, align 1 + ret void +} + +; CHECK: 'store_global_v2i64' +; CHECK: estimated cost of 5 for {{.*}} store <2 x i64> +define void @store_global_v2i64(<2 x i64> addrspace(1)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(1)* %out + ret void +} + +; CHECK: 'store_global_v2i64_align_1' +; CHECK: estimated cost of 5 for {{.*}} store <2 x i64> +define void @store_global_v2i64_align_1(<2 x i64> addrspace(1)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(1)* %out, align 1 + ret void +} + +; CHECK: 'store_global_v3i64' +; CHECK: estimated cost of 10 for {{.*}} store <3 x i64> +define void @store_global_v3i64(<3 x i64> addrspace(1)* %out) #0 { + store <3 x i64> zeroinitializer, <3 x i64> addrspace(1)* %out + ret void +} + +; CHECK: 'store_global_v4i64' +; CHECK: estimated cost of 10 for {{.*}} store <4 x i64> +define void @store_global_v4i64(<4 x i64> addrspace(1)* %out) #0 { + store <4 x i64> zeroinitializer, <4 x i64> addrspace(1)* %out + ret void +} + +; CHECK: 'store_global_v8i64' +; CHECK: estimated cost of 20 for {{.*}} store <8 x i64> +define void @store_global_v8i64(<8 x i64> addrspace(1)* %out) #0 { + store <8 x i64> zeroinitializer, <8 x i64> addrspace(1)* %out + ret void +} + +; CHECK: 'store_local_i32' +; CHECK: estimated cost of 3 for {{.*}} store i32 +define void @store_local_i32(i32 addrspace(3)* %out) #0 { + store i32 0, i32 addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i32_align_1' +; CHECK: estimated cost of 12 for {{.*}} store i32 +define void @store_local_i32_align_1(i32 addrspace(3)* %out) #0 { + store i32 0, i32 addrspace(3)* %out, align 1 + ret void +} + +; CHECK: 'store_local_i32_align_2' +; CHECK: estimated cost of 6 for {{.*}} store i32 +define void @store_local_i32_align_2(i32 addrspace(3)* %out) #0 { + store i32 0, i32 addrspace(3)* %out, align 2 + ret void +} + +; CHECK: 'store_local_v2i32' +; CHECK: estimated cost of 3 for {{.*}} store <2 x i32> +define void @store_local_v2i32(<2 x i32> addrspace(3)* 
%out) #0 { + store <2 x i32> zeroinitializer, <2 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v3i32' +; CHECK: estimated cost of 6 for {{.*}} store <3 x i32> +define void @store_local_v3i32(<3 x i32> addrspace(3)* %out) #0 { + store <3 x i32> zeroinitializer, <3 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v4i32' +; CHECK: estimated cost of 6 for {{.*}} store <4 x i32> +define void @store_local_v4i32(<4 x i32> addrspace(3)* %out) #0 { + store <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v8i32' +; CHECK: estimated cost of 12 for {{.*}} store <8 x i32> +define void @store_local_v8i32(<8 x i32> addrspace(3)* %out) #0 { + store <8 x i32> zeroinitializer, <8 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v16i32' +; CHECK: estimated cost of 24 for {{.*}} store <16 x i32> +define void @store_local_v16i32(<16 x i32> addrspace(3)* %out) #0 { + store <16 x i32> zeroinitializer, <16 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v32i32' +; CHECK: estimated cost of 48 for {{.*}} store <32 x i32> +define void @store_local_v32i32(<32 x i32> addrspace(3)* %out) #0 { + store <32 x i32> zeroinitializer, <32 x i32> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i8' +; CHECK: estimated cost of 3 for {{.*}} store i8 +define void @store_local_i8(i8 addrspace(3)* %out) #0 { + store i8 0, i8 addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v4i8' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i8> +define void @store_local_v4i8(<4 x i8> addrspace(3)* %out) #0 { + store <4 x i8> zeroinitializer, <4 x i8> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v4i8_align_1' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i8> +define void @store_local_v4i8_align_1(<4 x i8> addrspace(3)* %out) #0 { + store <4 x i8> zeroinitializer, <4 x i8> addrspace(3)* %out, align 1 + ret void +} + +; CHECK: 'store_local_v4i8_align_2' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i8> +define void @store_local_v4i8_align_2(<4 x i8> addrspace(3)* %out) #0 { + store <4 x i8> zeroinitializer, <4 x i8> addrspace(3)* %out, align 2 + ret void +} + +; CHECK: 'store_local_i16' +; CHECK: estimated cost of 3 for {{.*}} store i16 +define void @store_local_i16(i16 addrspace(3)* %out) #0 { + store i16 0, i16 addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i16_align_4' +; CHECK: estimated cost of 3 for {{.*}} store i16 +define void @store_local_i16_align_4(i16 addrspace(3)* %out) #0 { + store i16 0, i16 addrspace(3)* %out, align 4 + ret void +} + +; CHECK: 'store_local_v4i16' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i16> +define void @store_local_v4i16(<4 x i16> addrspace(3)* %out) #0 { + store <4 x i16> zeroinitializer, <4 x i16> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v8i16' +; CHECK: estimated cost of 24 for {{.*}} store <8 x i16> +define void @store_local_v8i16(<8 x i16> addrspace(3)* %out) #0 { + store <8 x i16> zeroinitializer, <8 x i16> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i64' +; CHECK: estimated cost of 3 for {{.*}} store i64 +define void @store_local_i64(i64 addrspace(3)* %out) #0 { + store i64 0, i64 addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_i64_align_1' +; CHECK: estimated cost of 24 for {{.*}} store i64 +define void @store_local_i64_align_1(i64 addrspace(3)* %out) #0 { + store i64 0, i64 addrspace(3)* %out, align 1 + ret void +} + +; CHECK: 'store_local_i64_align_2' +; CHECK: estimated cost of 12 
for {{.*}} store i64 +define void @store_local_i64_align_2(i64 addrspace(3)* %out) #0 { + store i64 0, i64 addrspace(3)* %out, align 2 + ret void +} + +; CHECK: 'store_local_v2i64' +; CHECK: estimated cost of 6 for {{.*}} store <2 x i64> +define void @store_local_v2i64(<2 x i64> addrspace(3)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v2i64_align_1' +; CHECK: estimated cost of 48 for {{.*}} store <2 x i64> +define void @store_local_v2i64_align_1(<2 x i64> addrspace(3)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(3)* %out, align 1 + ret void +} + +; CHECK: 'store_local_v2i64_align_2' +; CHECK: estimated cost of 24 for {{.*}} store <2 x i64> +define void @store_local_v2i64_align_2(<2 x i64> addrspace(3)* %out) #0 { + store <2 x i64> zeroinitializer, <2 x i64> addrspace(3)* %out, align 2 + ret void +} + +; CHECK: 'store_local_v3i64' +; CHECK: estimated cost of 9 for {{.*}} store <3 x i64> +define void @store_local_v3i64(<3 x i64> addrspace(3)* %out) #0 { + store <3 x i64> zeroinitializer, <3 x i64> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v4i64' +; CHECK: estimated cost of 12 for {{.*}} store <4 x i64> +define void @store_local_v4i64(<4 x i64> addrspace(3)* %out) #0 { + store <4 x i64> zeroinitializer, <4 x i64> addrspace(3)* %out + ret void +} + +; CHECK: 'store_local_v8i64' +; CHECK: estimated cost of 24 for {{.*}} store <8 x i64> +define void @store_local_v8i64(<8 x i64> addrspace(3)* %out) #0 { + store <8 x i64> zeroinitializer, <8 x i64> addrspace(3)* %out + ret void +} + + +; CHECK: 'load_constant_i32' +; CHECK: estimated cost of 2 for {{.*}} load i32 +define void @load_constant_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { + %val = load i32, i32 addrspace(2)* %in + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_i32_align_2' +; CHECK: estimated cost of 5 for {{.*}} load i32 +define void @load_constant_i32_align_2(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { + %val = load i32, i32 addrspace(2)* %in, align 2 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_i32_align_1' +; CHECK: estimated cost of 5 for {{.*}} load i32 +define void @load_constant_i32_align_1(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { + %val = load i32, i32 addrspace(2)* %in, align 1 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_i8' +; CHECK: estimated cost of 5 for {{.*}} load i8 +define void @load_constant_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { + %val = load i8, i8 addrspace(2)* %in + store i8 %val, i8 addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_i8_align_4' +; CHECK: estimated cost of 5 for {{.*}} load i8 +define void @load_constant_i8_align_4(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { + %val = load i8, i8 addrspace(2)* %in, align 4 + store i8 %val, i8 addrspace(1)* %out + ret void +} + +; FIXME: This currently is actually using buffer instructions on the scalarized vector. 
+ +; CHECK: 'load_constant_v4i8' +; CHECK: estimated cost of 8 for {{.*}} load <4 x i8> +define void @load_constant_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { + %val = load <4 x i8>, <4 x i8> addrspace(2)* %in + store <4 x i8> %val, <4 x i8> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v4i32' +; CHECK: estimated cost of 2 for {{.*}} load <4 x i32> +define void @load_constant_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { + %val = load <4 x i32>, <4 x i32> addrspace(2)* %in + store <4 x i32> %val, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v4i32_align_4' +; CHECK: estimated cost of 2 for {{.*}} load <4 x i32> +define void @load_constant_v4i32_align_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { + %val = load <4 x i32>, <4 x i32> addrspace(2)* %in, align 4 + store <4 x i32> %val, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v4i32_align_1' +; CHECK: estimated cost of 5 for {{.*}} load <4 x i32> +define void @load_constant_v4i32_align_1(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { + %val = load <4 x i32>, <4 x i32> addrspace(2)* %in, align 1 + store <4 x i32> %val, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v8i32' +; CHECK: estimated cost of 2 for {{.*}} load <8 x i32> +define void @load_constant_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { + %val = load <8 x i32>, <8 x i32> addrspace(2)* %in + store <8 x i32> %val, <8 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v16i32' +; CHECK: estimated cost of 2 for {{.*}} load <16 x i32> +define void @load_constant_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { + %val = load <16 x i32>, <16 x i32> addrspace(2)* %in + store <16 x i32> %val, <16 x i32> addrspace(1)* %out + ret void +} + +; CHECK: 'load_constant_v32i32' +; CHECK: estimated cost of 4 for {{.*}} load <32 x i32> +define void @load_constant_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { + %val = load <32 x i32>, <32 x i32> addrspace(2)* %in + store <32 x i32> %val, <32 x i32> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind }
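For reference, a minimal sketch (not part of this patch) of how a mid-end pass would reach the hook added above through the TargetTransformInfo facade; the helper name estimateStoreCost is hypothetical and assumes the getMemoryOpCost signature used by this branch (unsigned alignment and address space):

// Illustrative sketch only; not part of the patch.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static int estimateStoreCost(const TargetTransformInfo &TTI,
                             const StoreInst &SI) {
  Type *Ty = SI.getValueOperand()->getType();
  // Dispatches to AMDGPUTTIImpl::getMemoryOpCost when targeting amdgcn; the
  // address space selects the SMRD, buffer, or DS cost modeled above.
  return TTI.getMemoryOpCost(Instruction::Store, Ty, SI.getAlignment(),
                             SI.getPointerAddressSpace());
}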