Index: llvm/trunk/lib/Target/X86/X86Subtarget.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86Subtarget.cpp +++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp @@ -192,6 +192,13 @@ // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); + // All CPUs that implement SSE4.2 or SSE4A support reasonably fast + // unaligned accesses of 16 bytes and under. These features were + // introduced with Intel's Nehalem/Silvermont and AMD's Family10h + // micro-architectures, respectively. + if (hasSSE42() || hasSSE4A()) + IsUAMemUnder32Slow = false; + InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with Index: llvm/trunk/test/CodeGen/X86/slow-unaligned-mem.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/slow-unaligned-mem.ll +++ llvm/trunk/test/CodeGen/X86/slow-unaligned-mem.ll @@ -55,6 +55,11 @@ ; Slow chips use 4-byte stores. Fast chips with SSE or later use something other than 4-byte stores. ; Chips that don't have SSE use 4-byte stores either way, so they're not tested. +; Also verify that SSE4.2 or SSE4a implies fast unaligned accesses. + +; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4.2 2>&1 | FileCheck %s --check-prefix=FAST +; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4a 2>&1 | FileCheck %s --check-prefix=FAST + define void @store_zeros(i8* %a) { ; SLOW-NOT: not a recognized processor ; SLOW-LABEL: store_zeros: