Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -21022,6 +21022,35 @@ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } +/// Change a 256-bit vector store into a pair of 128-bit vector stores. +static SDValue split256BitStore(StoreSDNode *Store, SelectionDAG &DAG) { + SDValue StoredVal = Store->getValue(); + assert(StoredVal.getValueType().is256BitVector() && "Expecting 256-bit op"); + + // Splitting volatile memory ops is not allowed unless the operation was not + // legal to begin with. We are assuming the input op is legal (this transform + // is only used for targets with AVX). + if (Store->isVolatile()) + return SDValue(); + + MVT StoreVT = StoredVal.getSimpleValueType(); + unsigned NumElems = StoreVT.getVectorNumElements(); + SDLoc DL(Store); + SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, DL); + SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, DL); + SDValue Ptr0 = Store->getBasePtr(); + SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, DL); + unsigned Alignment = Store->getAlignment(); + SDValue Ch0 = + DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), + Alignment, Store->getMemOperand()->getFlags()); + SDValue Ch1 = + DAG.getStore(Store->getChain(), DL, Value1, Ptr1, + Store->getPointerInfo().getWithOffset(16), + MinAlign(Alignment, 16), Store->getMemOperand()->getFlags()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); +} + static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast<StoreSDNode>(Op.getNode()); @@ -39345,20 +39374,7 @@ if (NumElems < 2) return SDValue(); - SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl); - SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl); - - SDValue Ptr0 = St->getBasePtr(); - SDValue Ptr1 = 
DAG.getMemBasePlusOffset(Ptr0, 16, dl); - - SDValue Ch0 = - DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), - Alignment, St->getMemOperand()->getFlags()); - SDValue Ch1 = - DAG.getStore(St->getChain(), dl, Value1, Ptr1, - St->getPointerInfo().getWithOffset(16), - MinAlign(Alignment, 16U), St->getMemOperand()->getFlags()); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + return split256BitStore(St, DAG); } // Optimize trunc store (of multiple scalars) to shuffle and store. Index: llvm/trunk/test/CodeGen/X86/avx-load-store.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-load-store.ll +++ llvm/trunk/test/CodeGen/X86/avx-load-store.ll @@ -187,8 +187,10 @@ define void @double_save_volatile(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind { ; CHECK-LABEL: double_save_volatile: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, 16(%rdi) -; CHECK-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: double_save_volatile: