Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -389,7 +389,8 @@
   bool isFPVectorizationPotentiallyUnsafe() const;
 
   /// \brief Determine if the target supports unaligned memory accesses.
-  bool allowsMisalignedMemoryAccesses(unsigned BitWidth, unsigned AddressSpace = 0,
+  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                      unsigned BitWidth, unsigned AddressSpace = 0,
                                       unsigned Alignment = 1,
                                       bool *Fast = nullptr) const;
 
@@ -668,7 +669,8 @@
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
-  virtual bool allowsMisalignedMemoryAccesses(unsigned BitWidth,
+  virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                              unsigned BitWidth,
                                               unsigned AddressSpace,
                                               unsigned Alignment,
                                               bool *Fast) = 0;
@@ -841,9 +843,10 @@
   bool isFPVectorizationPotentiallyUnsafe() override {
     return Impl.isFPVectorizationPotentiallyUnsafe();
   }
-  bool allowsMisalignedMemoryAccesses(unsigned BitWidth, unsigned AddressSpace,
+  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                      unsigned BitWidth, unsigned AddressSpace,
                                       unsigned Alignment, bool *Fast) override {
-    return Impl.allowsMisalignedMemoryAccesses(BitWidth, AddressSpace,
+    return Impl.allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
                                                Alignment, Fast);
   }
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -253,7 +253,8 @@
 
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
-  bool allowsMisalignedMemoryAccesses(unsigned BitWidth,
+  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                      unsigned BitWidth,
                                       unsigned AddressSpace,
                                       unsigned Alignment,
                                       bool *Fast) { return false; }
Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
@@ -105,10 +105,11 @@
   /// \name Scalar TTI Implementations
   /// @{
 
-  bool allowsMisalignedMemoryAccesses(unsigned BitWidth, unsigned AddressSpace,
+  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                      unsigned BitWidth, unsigned AddressSpace,
                                       unsigned Alignment, bool *Fast) const {
-    MVT M = MVT::getIntegerVT(BitWidth);
-    return getTLI()->allowsMisalignedMemoryAccesses(M, AddressSpace, Alignment, Fast);
+    EVT E = EVT::getIntegerVT(Context, BitWidth);
+    return getTLI()->allowsMisalignedMemoryAccesses(E, AddressSpace, Alignment, Fast);
  }
 
  bool hasBranchDivergence() { return false; }
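[Editor's illustration, not part of the patch.] The interface change above threads an LLVMContext through allowsMisalignedMemoryAccesses so that BasicTTIImpl can build an EVT with EVT::getIntegerVT(Context, BitWidth) instead of an MVT, which lets callers query bit widths that have no simple machine value type. A minimal sketch of a hypothetical client of the updated hook; the function name isFastUnalignedAccess and its parameters are invented for this example:

  // Sketch only: ask whether a misaligned access of BitWidth bits is both
  // allowed and fast for the target behind TTI.
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  static bool isFastUnalignedAccess(const TargetTransformInfo &TTI,
                                    LLVMContext &Ctx, unsigned BitWidth,
                                    unsigned AddrSpace, unsigned Alignment) {
    bool Fast = false;
    // Passing the context lets the default implementation form an EVT, so
    // widths such as 96 bits no longer have to map onto a simple MVT.
    bool Allows = TTI.allowsMisalignedMemoryAccesses(Ctx, BitWidth, AddrSpace,
                                                     Alignment, &Fast);
    return Allows && Fast;
  }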
Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
@@ -186,11 +186,12 @@
   return TTIImpl->isFPVectorizationPotentiallyUnsafe();
 }
 
-bool TargetTransformInfo::allowsMisalignedMemoryAccesses(unsigned BitWidth,
+bool TargetTransformInfo::allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                                         unsigned BitWidth,
                                                          unsigned AddressSpace,
                                                          unsigned Alignment,
                                                          bool *Fast) const
 {
-  return TTIImpl->allowsMisalignedMemoryAccesses(BitWidth, AddressSpace,
+  return TTIImpl->allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
                                                  Alignment, Fast);
 }
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -439,8 +439,12 @@
 
   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
   // which isn't a simple VT.
-  if (!VT.isSimple() || VT == MVT::Other)
+  // Until MVT is extended to handle this, simply check for the size and
+  // rely on the condition below: allow accesses if the size is a multiple of 4.
+  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
+                           VT.getStoreSize() > 16)) {
     return false;
+  }
 
   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
Index: llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -40,9 +40,8 @@
 
 namespace {
 
-// TODO: Remove this
-static const unsigned TargetBaseAlign = 4;
-
+// FIXME: Assuming stack alignment of 4 is always good enough
+static const unsigned StackAdjustedAlignment = 4;
 typedef SmallVector<Instruction *, 8> InstrList;
 typedef MapVector<Value *, InstrList> InstrListMap;
 
@@ -798,8 +797,8 @@
       // so we can cheat and change it!
       Value *V = GetUnderlyingObject(S0->getPointerOperand(), DL);
       if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
-        AI->setAlignment(TargetBaseAlign);
-        Alignment = TargetBaseAlign;
+        AI->setAlignment(StackAdjustedAlignment);
+        Alignment = StackAdjustedAlignment;
       } else {
         return false;
       }
@@ -948,8 +947,8 @@
       // so we can cheat and change it!
       Value *V = GetUnderlyingObject(L0->getPointerOperand(), DL);
       if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
-        AI->setAlignment(TargetBaseAlign);
-        Alignment = TargetBaseAlign;
+        AI->setAlignment(StackAdjustedAlignment);
+        Alignment = StackAdjustedAlignment;
       } else {
         return false;
       }
@@ -1029,10 +1028,13 @@
 
 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                     unsigned Alignment) {
+  if (Alignment % SzInBytes == 0)
+    return false;
   bool Fast = false;
-  bool Allows = TTI.allowsMisalignedMemoryAccesses(SzInBytes * 8, AddressSpace,
+  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
+                                                   SzInBytes * 8, AddressSpace,
                                                    Alignment, &Fast);
-  // TODO: Remove TargetBaseAlign
-  return !(Allows && Fast) && (Alignment % SzInBytes) != 0 &&
-         (Alignment % TargetBaseAlign) != 0;
+  DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+               << " and fast? " << Fast << "\n";);
+  return !Allows || !Fast;
 }
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll
@@ -22,9 +22,9 @@
 define void @load_fn(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn()
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -35,9 +35,9 @@
 define void @load_fn_nounwind(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_nounwind() #0
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -48,9 +48,9 @@
 define void @load_fn_nounwind_writeonly(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_nounwind_writeonly() #1
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -60,9 +60,9 @@
 define void @load_fn_nounwind_readonly(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_nounwind_readonly() #2
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -73,9 +73,9 @@
 define void @load_fn_readonly(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_readonly() #4
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -86,9 +86,9 @@
 define void @load_fn_writeonly(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_writeonly() #3
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -98,9 +98,9 @@
 define void @load_fn_readnone(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_readnone() #5
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -193,9 +193,9 @@
 define void @store_fn_readnone(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  store i32 0, i32* %p
+  store i32 0, i32* %p, align 8
   call void @fn_readnone() #5
-  store i32 0, i32* %p.1
+  store i32 0, i32* %p.1, align 8
   ret void
 }
 
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll
@@ -6,7 +6,7 @@
 
 define void @foo() {
   ; CHECK: load <4 x float>
-  %a = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 0), align 4
+  %a = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 0), align 16
   %b = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 1), align 4
   %c = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 2), align 4
   %d = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 3), align 4
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
@@ -17,8 +17,8 @@
 define void @preserve_order_32(%struct.buffer_t* noalias %buff) #0 {
 entry:
   %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 1
-  %buff.p = load i8*, i8** %tmp1, align 8
-  %buff.val = load i8, i8* %buff.p, align 8
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
   store i8 0, i8* %buff.p, align 8
   %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 0
   %buff.int = load i32, i32* %tmp0, align 8
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
@@ -18,11 +18,11 @@
 define void @preserve_order_64(%struct.buffer_t* noalias %buff) #0 {
 entry:
   %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
-  %buff.p = load i8*, i8** %tmp1, align 8
-  %buff.val = load i8, i8* %buff.p, align 8
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
   store i8 0, i8* %buff.p, align 8
   %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
-  %buff.int = load i64, i64* %tmp0, align 8
+  %buff.int = load i64, i64* %tmp0, align 16
   ret void
 }
 
@@ -36,12 +36,12 @@
 entry:
   %nest0_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
   %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest0_0, i64 0, i32 1
-  %buff.p = load i8*, i8** %tmp1, align 8
-  %buff.val = load i8, i8* %buff.p, align 8
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
   store i8 0, i8* %buff.p, align 8
   %nest1_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
   %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest1_0, i64 0, i32 0
-  %buff.int = load i64, i64* %tmp0, align 8
+  %buff.int = load i64, i64* %tmp0, align 16
   ret void
 }
 
@@ -55,8 +55,8 @@
 define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) {
 entry:
   %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
-  %buff.p = load i8*, i8** %tmp1, align 8
-  %buff.val = load i8, i8* %buff.p, align 8
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
   store i8 0, i8* %buff.p, align 8
   br label %"for something"
 
@@ -64,7 +64,7 @@
   %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ]
 
   %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
-  %buff.int = load i64, i64* %tmp0, align 8
+  %buff.int = load i64, i64* %tmp0, align 16
 
   %index.next = add i64 %index, 8
   %cmp_res = icmp eq i64 %index.next, 8
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"