Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -983,10 +983,10 @@
   // Each load/store unit costs 1.
   int Cost = LT.first * 1;
 
-  // On Sandybridge 256bit load/stores are double pumped
-  // (but not on Haswell).
-  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
-    Cost*=2;
+  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
+  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
+  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
+    Cost *= 2;
 
   return Cost;
 }
Index: test/Transforms/LoopVectorize/X86/avx1.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/avx1.ll
+++ test/Transforms/LoopVectorize/X86/avx1.ll
@@ -26,10 +26,10 @@
   ret i32 undef
 }
 
 
-;;; FIXME: If 32-byte accesses are fast, this should use a <4 x i64> load.
 ; CHECK-LABEL: @read_mod_i64(
-; CHECK: load <2 x i64>
+; SLOWMEM32: load <2 x i64>
+; FASTMEM32: load <4 x i64>
 ; CHECK: ret i32
 define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
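
The new SLOWMEM32 and FASTMEM32 check lines only fire if the test's RUN lines request those FileCheck prefixes; that hunk of avx1.ll is not shown in this excerpt. A minimal sketch of what such RUN lines look like, assuming corei7-avx (Sandybridge, slow unaligned 32-byte accesses) and core-avx2 (Haswell) as the two CPUs; the actual RUN lines in the test may differ and typically also pin an -mtriple:

; Run the loop vectorizer for a CPU with slow 32-byte accesses and check the SLOWMEM32 pattern.
; RUN: opt < %s -loop-vectorize -mcpu=corei7-avx -S | FileCheck %s -check-prefix=CHECK -check-prefix=SLOWMEM32
; Run it again for a CPU with fast 32-byte accesses and check the FASTMEM32 pattern.
; RUN: opt < %s -loop-vectorize -mcpu=core-avx2 -S | FileCheck %s -check-prefix=CHECK -check-prefix=FASTMEM32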