Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -946,6 +946,9 @@ } // default case + + // FIXME: This is broken on SI where we still need to check if the base + // pointer is positive here. Base = Addr; Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -35,6 +35,7 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Vectorize.h" #include <memory> using namespace llvm; @@ -148,6 +149,13 @@ namespace { +// Option to disable vectorizer for tests. +static cl::opt<bool> EnableLoadStoreVectorizer( + "amdgpu-load-store-vectorizer", + cl::desc("Enable load store vectorizer"), + cl::init(true), + cl::Hidden); + class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) @@ -244,6 +252,9 @@ addPass(createSROAPass()); } TargetPassConfig::addCodeGenPrepare(); + + if (EnableLoadStoreVectorizer) + addPass(createLoadStoreVectorizerPass()); } bool Index: test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll =================================================================== --- test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll +++ test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll @@ -25,7 +25,7 @@ %gptr0.phi = phi i32 addrspace(1)* [ %gep0, %entry ], [ %gep0.inc, %for.body ] %gptr1.phi = phi i32 addrspace(1)* [ %gep1, %entry ], [ %gep1.inc, %for.body ] %lptr0.phi = phi i32 addrspace(3)* [ %gep2, %entry ], [ %gep2.inc, %for.body ] - %lptr1 = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 1 + %lptr1 = getelementptr i32, i32 
addrspace(3)* %lptr0.phi, i32 2 %val0 = load i32, i32 addrspace(1)* %gep0 store i32 %val0, i32 addrspace(3)* %lptr0.phi %val1 = load i32, i32 addrspace(1)* %gep1 Index: test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll =================================================================== --- test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll +++ test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -10,17 +10,17 @@ ; CHECK: BB0_1: ; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]], ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] -; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], vcc, 4, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]] +; SI-DAG: v_add_i32_e32 [[VADDR8:v[0-9]+]], vcc, 8, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR8]] ; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]] -; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], vcc, 0x84, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]] +; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, 0x88, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x88]] ; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]] -; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1 -; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33 +; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:2 +; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 ; CHECK: s_endpgm define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { @@ -36,13 +36,13 @@ tail call void @llvm.amdgcn.s.barrier() #1 %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02 %tmp = load float, float addrspace(3)* %arrayidx, align 4 - %add1 = add 
nsw i32 %offset.02, 1 + %add1 = add nsw i32 %offset.02, 2 %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1 %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4 %add3 = add nsw i32 %offset.02, 32 %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3 %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4 - %add5 = add nsw i32 %offset.02, 33 + %add5 = add nsw i32 %offset.02, 34 %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5 %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4 %add7 = add nsw i32 %offset.02, 64 Index: test/CodeGen/AMDGPU/ds_read2_offset_order.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -10,8 +10,8 @@ ; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:4{{$}} ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:14 offset1:12 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12 define void @offset_order(float addrspace(1)* %out) { entry: Index: test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll =================================================================== --- test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GatherAllAliases gives up on trying to analyze cases where the ; pointer may have been loaded from an aliased store, so make sure @@ -32,4 +32,4 @@ !0 = !{} -attributes #0 = { nounwind } \ No 
newline at end of file +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -47,17 +47,17 @@ %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %a = load float, float addrspace(1)* %in.gep.0, align 4 - %b = load float, float addrspace(1)* %in.gep.1, align 4 - %c = load float, float addrspace(1)* %in.gep.2, align 4 + %a = load volatile float, float addrspace(1)* %in.gep.0, align 4 + %b = load volatile float, float addrspace(1)* %in.gep.1, align 4 + %c = load volatile float, float addrspace(1)* %in.gep.2, align 4 %mul0 = fmul float %a, %b %mul1 = fmul float %a, %c %madak0 = fadd float %mul0, 10.0 %madak1 = fadd float %mul1, 10.0 - store float %madak0, float addrspace(1)* %out.gep.0, align 4 - store float %madak1, float addrspace(1)* %out.gep.1, align 4 + store volatile float %madak0, float addrspace(1)* %out.gep.0, align 4 + store volatile float %madak1, float addrspace(1)* %out.gep.1, align 4 ret void } Index: test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- test/CodeGen/AMDGPU/merge-stores.ll +++ test/CodeGen/AMDGPU/merge-stores.ll @@ -1,9 +1,10 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s +; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s -; RUN: llc -march=amdgcn 
-verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; This test is mostly to test DAG store merging, so disable the vectorizer. ; Run with devices with different unaligned load restrictions. ; TODO: Vector element tests Index: test/CodeGen/AMDGPU/private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/private-memory.ll +++ test/CodeGen/AMDGPU/private-memory.ll @@ -1,9 +1,9 @@ -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC 
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -amdgpu-load-store-vectorizer=0 < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -amdgpu-load-store-vectorizer=0 < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -amdgpu-load-store-vectorizer=0 < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -amdgpu-load-store-vectorizer=0 < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga -amdgpu-load-store-vectorizer=0 < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga -amdgpu-load-store-vectorizer=0 < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s Index: test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll +++ 
test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mattr=+promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Pointer value is stored in a candidate for LDS usage. Index: test/CodeGen/AMDGPU/schedule-global-loads.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-global-loads.ll +++ test/CodeGen/AMDGPU/schedule-global-loads.ll @@ -1,6 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - ; FIXME: This currently doesn't do a great job of clustering the ; loads, which end up with extra moves between them. 
Right now, it ; seems the only things areLoadsFromSameBasePtr is accomplishing is @@ -8,12 +7,12 @@ ; FUNC-LABEL: {{^}}cluster_global_arg_loads: ; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 +; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; SI: buffer_store_dword [[REG0]] ; SI: buffer_store_dword [[REG1]] define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { %load0 = load i32, i32 addrspace(1)* %ptr, align 4 - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1 + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2 %load1 = load i32, i32 addrspace(1)* %gep, align 4 store i32 %load0, i32 addrspace(1)* %out0, align 4 store i32 %load1, i32 addrspace(1)* %out1, align 4 Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -11,13 +11,13 @@ ; FUNC-LABEL: @reorder_local_load_global_store_local_load ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 -; CI: buffer_store_dword +; CI-NEXT: buffer_store_dword +; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 store i32 99, i32 addrspace(1)* %gptr, align 4 @@ -32,12 +32,12 @@ ; FUNC-LABEL: 
@no_reorder_local_load_volatile_global_store_local_load ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 ; CI: buffer_store_dword -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 store volatile i32 99, i32 addrspace(1)* %gptr, align 4 @@ -51,13 +51,13 @@ ; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: buffer_store_dword define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 store i32 99, i32 addrspace(1)* %gptr, align 4 @@ -75,13 +75,13 @@ ; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 -; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2 +; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 ; CI: buffer_store_dword define void 
@reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3 %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 store i32 99, i32 addrspace(1)* %gptr, align 4 @@ -97,14 +97,14 @@ ; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 -; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2 +; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 ; CI: ds_write_b32 ; CI: buffer_store_dword define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3 %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 store i32 99, i32 addrspace(3)* %lptr, align 4 @@ -143,7 +143,7 @@ ; CI: buffer_store_dword define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2 + %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3 %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4 store i32 99, i32 addrspace(3)* %lptr, align 4 @@ -160,15 +160,15 @@ ; loads and stores, and I'm not sure that it really is. 
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 ; CI: buffer_store_dword ; CI: s_endpgm define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100 - %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101 + %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102 store i32 123, i32 addrspace(3)* %ptr1, align 4 %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4 @@ -184,18 +184,17 @@ } ; FUNC-LABEL: @reorder_global_offsets -; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 -; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 -; CI: buffer_store_dword +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 ; CI: s_endpgm define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { %ptr1 
= getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100 - %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101 + %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102 store i32 123, i32 addrspace(1)* %ptr1, align 4 %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4