Index: llvm/trunk/lib/Target/X86/X86.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86.td
+++ llvm/trunk/lib/Target/X86/X86.td
@@ -82,6 +82,9 @@
 def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
                                         "IsUAMemFast", "true",
                                         "Fast unaligned memory access">;
+def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+                                          "IsUAMem32Slow", "true",
+                                          "Slow unaligned 32-byte memory access">;
 def FeatureSSE4A   : SubtargetFeature<"sse4a", "HasSSE4A", "true",
                                       "Support SSE 4a instructions",
                                       [FeatureSSE3]>;
@@ -271,12 +274,14 @@
 // rather than a superset.
 def : ProcessorModel<"corei7-avx", SandyBridgeModel,
                      [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
+                      FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES,
+                      FeaturePCLMUL]>;
 // Ivy Bridge
 def : ProcessorModel<"core-avx-i", SandyBridgeModel,
                      [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
-                      FeatureF16C, FeatureFSGSBase]>;
+                      FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES,
+                      FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+                      FeatureFSGSBase]>;
 
 // Haswell
 def : ProcessorModel<"core-avx2", HaswellModel,
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -24376,11 +24376,12 @@
   SDLoc dl(Ld);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
-  // On Sandybridge unaligned 256bit loads are inefficient.
+  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+  // into two 16-byte operations.
   ISD::LoadExtType Ext = Ld->getExtensionType();
   unsigned Alignment = Ld->getAlignment();
   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
     unsigned NumElems = RegVT.getVectorNumElements();
     if (NumElems < 2)
@@ -24423,13 +24424,11 @@
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
-  // If we are saving a concatenation of two XMM registers, perform two stores.
-  // On Sandy Bridge, 256-bit memory operations are executed by two
-  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
-  // memory operation.
+  // If we are saving a concatenation of two XMM registers and 32-byte stores
+  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
   unsigned Alignment = St->getAlignment();
   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
-  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
       StVT == VT && !IsAligned) {
     unsigned NumElems = VT.getVectorNumElements();
     if (NumElems < 2)
Index: llvm/trunk/lib/Target/X86/X86Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h
@@ -159,6 +159,9 @@
   /// IsUAMemFast - True if unaligned memory access is fast.
   bool IsUAMemFast;
 
+  /// True if unaligned 32-byte memory accesses are slow.
+  bool IsUAMem32Slow;
+
   /// HasVectorUAMem - True if SIMD operations can have unaligned memory
   /// operands. This may require setting a feature bit in the processor.
   bool HasVectorUAMem;
@@ -374,6 +377,7 @@
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
+  bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
   bool hasVectorUAMem() const { return HasVectorUAMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
Index: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp
@@ -264,6 +264,7 @@
   IsBTMemSlow = false;
   IsSHLDSlow = false;
   IsUAMemFast = false;
+  IsUAMem32Slow = false;
   HasVectorUAMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
Index: llvm/trunk/test/CodeGen/X86/2012-05-19-avx2-store.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/2012-05-19-avx2-store.ll
+++ llvm/trunk/test/CodeGen/X86/2012-05-19-avx2-store.ll
@@ -1,13 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
-
-define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
-entry:
-  ; CHECK: vmovaps
-  ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
-  ; CHECK: vmovups
-  %A = load <4 x i32>* %Ap
-  %B = load <4 x i32>* %Bp
-  %Z = shufflevector <4 x i32> %A, <4 x i32> %B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i32> %Z, <8 x i32>* %P, align 16
-  ret void
-}
Index: llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
+; because that is slower than two 16-byte loads.
+; Other AVX-capable chips don't have that problem.
+
+define <8 x float> @load32bytes(<8 x float>* %Ap) {
+  ; CHECK-LABEL: load32bytes
+
+  ; SANDYB: vmovaps
+  ; SANDYB: vinsertf128
+  ; SANDYB: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL: retq
+
+  %A = load <8 x float>* %Ap, align 16
+  ret <8 x float> %A
+}
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
+; because that is slower than two 16-byte stores.
+; Other AVX-capable chips don't have that problem.
+
+define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
+  ; CHECK-LABEL: store32bytes
+
+  ; SANDYB: vextractf128
+  ; SANDYB: vmovaps
+  ; SANDYB: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL: retq
+
+  store <8 x float> %A, <8 x float>* %P, align 16
+  ret void
+}
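
Usage note (not part of the patch): because FeatureSlowUAMem32 is an ordinary SubtargetFeature, the new behavior can also be toggled independently of -mcpu through -mattr using the "slow-unaligned-mem-32" string defined above. The sketch below illustrates this; the file contents, function name, and RUN line are illustrative assumptions in the style of the tests in this change, not something added by the patch:

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-slow-unaligned-mem-32 | FileCheck %s
; With the feature explicitly disabled on a Sandy Bridge CPU model, the
; 16-byte-aligned 32-byte load below should stay a single unaligned 32-byte
; load rather than being split into two 16-byte loads plus an insert.
; CHECK: vmovups
; CHECK-NOT: vinsertf128

define <8 x float> @load32bytes_fast(<8 x float>* %Ap) {
  %A = load <8 x float>* %Ap, align 16
  ret <8 x float> %A
}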