Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -93,6 +93,11 @@
              "stores respectively."),
     cl::Hidden);
 
+static cl::opt<bool>
+    EnableLightAVX("x86-light-avx", cl::init(false),
+                   cl::desc("Enable generation of 256-bit AVX loads/stores, "
+                            "even with -mprefer-vector-width=128"));
+
 /// Call this when the user attempts to do something unsupported, like
 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
 /// report_fatal_error, so calling code should attempt to recover without
@@ -2657,7 +2662,7 @@
   }
   // FIXME: Check if unaligned 32-byte accesses are slow.
   if (Op.size() >= 32 && Subtarget.hasAVX() &&
-      (Subtarget.getPreferVectorWidth() >= 256)) {
+      (Subtarget.getPreferVectorWidth() >= 256 || EnableLightAVX)) {
     // Although this isn't a well-supported type for AVX1, we'll let
     // legalization and shuffle lowering produce the optimal codegen. If we
     // choose an optimal type with a vector element larger than a byte,
Index: llvm/test/CodeGen/X86/memcpy-light-avx.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/memcpy-light-avx.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell -mattr=prefer-128-bit -x86-light-avx=true | FileCheck %s
+; Check that a 32-byte memcpy is lowered to 256-bit AVX loads/stores when
+; -x86-light-avx is enabled, even though prefer-128-bit caps the vector width.
+
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
+
+define void @test1(ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0:
+; CHECK-NEXT:    vmovups (%rsi), %ymm0
+; CHECK-NEXT:    vmovups %ymm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  tail call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr %b, i64 32, i1 false)
+  ret void
+}