Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -22558,7 +22558,9 @@
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
-/// to a simple store and scalar loads to extract the elements.
+/// into a somewhat faster sequence. For i686, the best sequence is apparently
+/// storing the value and loading scalars back, while for x64 we should
+/// use 64-bit extracts and shifts.
 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                 TargetLowering::DAGCombinerInfo &DCI) {
   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
@@ -22617,36 +22619,61 @@
   return SDValue();

   // Ok, we've now decided to do the transformation.
+  // If 64-bit shifts are legal, use the extract-shift sequence,
+  // otherwise bounce the vector off the cache.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Vals[4];
   SDLoc dl(InputVector);
+
+  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
+    SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
+    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
+    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+                                     DAG.getConstant(0, VecIdxTy));
+    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+                                  DAG.getConstant(1, VecIdxTy));
+
+    SDValue ShAmt = DAG.getConstant(32,
+        DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
+    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
+    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+                          DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
+    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
+    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+                          DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
+  } else {
+    // Store the value to a temporary stack slot.
+    SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+                              MachinePointerInfo(), false, false, 0);
-  // Store the value to a temporary stack slot.
-  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
-  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
-                            MachinePointerInfo(), false, false, 0);
+    EVT ElementType = InputVector.getValueType().getVectorElementType();
+    unsigned EltSize = ElementType.getSizeInBits() / 8;
-  // Replace each use (extract) with a load of the appropriate element.
-  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
-       UE = Uses.end(); UI != UE; ++UI) {
-    SDNode *Extract = *UI;
+    // Replace each use (extract) with a load of the appropriate element.
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t Offset = EltSize * i;
+      SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
-    // cOMpute the element's address.
-    SDValue Idx = Extract->getOperand(1);
-    unsigned EltSize =
-        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
-    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+      SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+                                       StackPtr, OffsetVal);
-    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
-                                     StackPtr, OffsetVal);
+      // Load the scalar.
+      Vals[i] = DAG.getLoad(ElementType, dl, Ch,
+                            ScalarAddr, MachinePointerInfo(),
+                            false, false, false, 0);
-    // Load the scalar.
-    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
-                                     ScalarAddr, MachinePointerInfo(),
-                                     false, false, false, 0);
+    }
+  }
-    // Replace the exact with the load.
-    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+  // Replace the extracts
+  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+       UE = Uses.end(); UI != UE; ++UI) {
+    SDNode *Extract = *UI;
+
+    SDValue Idx = Extract->getOperand(1);
+    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
   }

   // The replacement was made in place; don't return anything.
Index: llvm/trunk/test/CodeGen/X86/gather-addresses.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/gather-addresses.ll
+++ llvm/trunk/test/CodeGen/X86/gather-addresses.ll
@@ -1,35 +1,38 @@
 ; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN
 ; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN
+; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN32
 ; rdar://7398554

 ; When doing vector gather-scatter index calculation with 32-bit indices,
-; bounce the vector off of cache rather than shuffling each individual
+; use an efficient mov/shift sequence rather than shuffling each individual
 ; element out of the index vector.

-; CHECK: foo:
-; LIN: movaps (%rsi), %xmm0
-; LIN: andps (%rdx), %xmm0
-; LIN: movaps %xmm0, -24(%rsp)
-; LIN: movslq -24(%rsp), %[[REG1:r.+]]
-; LIN: movslq -20(%rsp), %[[REG2:r.+]]
-; LIN: movslq -16(%rsp), %[[REG3:r.+]]
-; LIN: movslq -12(%rsp), %[[REG4:r.+]]
-; LIN: movsd (%rdi,%[[REG1]],8), %xmm0
-; LIN: movhpd (%rdi,%[[REG2]],8), %xmm0
-; LIN: movsd (%rdi,%[[REG3]],8), %xmm1
-; LIN: movhpd (%rdi,%[[REG4]],8), %xmm1
+; CHECK-LABEL: foo:
+; LIN: movdqa (%rsi), %xmm0
+; LIN: pand (%rdx), %xmm0
+; LIN: pextrq $1, %xmm0, %r[[REG4:.+]]
+; LIN: movd %xmm0, %r[[REG2:.+]]
+; LIN: movslq %e[[REG2]], %r[[REG1:.+]]
+; LIN: sarq $32, %r[[REG2]]
+; LIN: movslq %e[[REG4]], %r[[REG3:.+]]
+; LIN: sarq $32, %r[[REG4]]
+; LIN: movsd (%rdi,%r[[REG1]],8), %xmm0
+; LIN: movhpd (%rdi,%r[[REG2]],8), %xmm0
+; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
+; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1

-; WIN: movaps (%rdx), %xmm0
-; WIN: andps (%r8), %xmm0
-; WIN: movaps %xmm0, (%rsp)
-; WIN: movslq (%rsp), %[[REG1:r.+]]
-; WIN: movslq 4(%rsp), %[[REG2:r.+]]
-; WIN: movslq 8(%rsp), %[[REG3:r.+]]
-; WIN: movslq 12(%rsp), %[[REG4:r.+]]
-; WIN: movsd (%rcx,%[[REG1]],8), %xmm0
-; WIN: movhpd (%rcx,%[[REG2]],8), %xmm0
-; WIN: movsd (%rcx,%[[REG3]],8), %xmm1
-; WIN: movhpd (%rcx,%[[REG4]],8), %xmm1
+; WIN: movdqa (%rdx), %xmm0
+; WIN: pand (%r8), %xmm0
+; WIN: pextrq $1, %xmm0, %r[[REG4:.+]]
+; WIN: movd %xmm0, %r[[REG2:.+]]
+; WIN: movslq %e[[REG2]], %r[[REG1:.+]]
+; WIN: sarq $32, %r[[REG2]]
+; WIN: movslq %e[[REG4]], %r[[REG3:.+]]
+; WIN: sarq $32, %r[[REG4]]
+; WIN: movsd (%rcx,%r[[REG1]],8), %xmm0
+; WIN: movhpd (%rcx,%r[[REG2]],8), %xmm0
+; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
+; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1

 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
   %a = load <4 x i32>* %i
@@ -53,3 +56,35 @@
   %v3 = insertelement <4 x double> %v2, double %r3, i32 3
   ret <4 x double> %v3
 }
+
+; Check that the sequence previously used above, which bounces the vector off the
+; cache, works for x86-32. Note that in this case it will not be used for index
+; calculation, since indexes are 32-bit, not 64.
+; CHECK-LABEL: old:
+; LIN32: movaps %xmm0, (%esp)
+; LIN32-DAG: {{(mov|and)}}l (%esp),
+; LIN32-DAG: {{(mov|and)}}l 4(%esp),
+; LIN32-DAG: {{(mov|and)}}l 8(%esp),
+; LIN32-DAG: {{(mov|and)}}l 12(%esp),
+define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind {
+  %a = load <4 x i32>* %i
+  %b = load <4 x i32>* %h
+  %j = and <4 x i32> %a, %b
+  %d0 = extractelement <4 x i32> %j, i32 0
+  %d1 = extractelement <4 x i32> %j, i32 1
+  %d2 = extractelement <4 x i32> %j, i32 2
+  %d3 = extractelement <4 x i32> %j, i32 3
+  %q0 = zext i32 %d0 to i64
+  %q1 = zext i32 %d1 to i64
+  %q2 = zext i32 %d2 to i64
+  %q3 = zext i32 %d3 to i64
+  %r0 = and i64 %q0, %f
+  %r1 = and i64 %q1, %f
+  %r2 = and i64 %q2, %f
+  %r3 = and i64 %q3, %f
+  %v0 = insertelement <4 x i64> undef, i64 %r0, i32 0
+  %v1 = insertelement <4 x i64> %v0, i64 %r1, i32 1
+  %v2 = insertelement <4 x i64> %v1, i64 %r2, i32 2
+  %v3 = insertelement <4 x i64> %v2, i64 %r3, i32 3
+  ret <4 x i64> %v3
+}