Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -34820,6 +34820,24 @@ if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; + // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type + // legalization destroys the v4i32 type. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 && + VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC && + N0.getOperand(0).getValueType() == MVT::v4i32 && + ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) && + cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) { + SDValue N00 = N0.getOperand(0); + // Only do this if we can avoid scalarizing the input. + if (ISD::isNormalLoad(N00.getNode()) || + (N00.getOpcode() == ISD::BITCAST && + N00.getOperand(0).getValueType() == MVT::v4f32)) { + SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, + DAG.getBitcast(MVT::v4f32, N00)); + return DAG.getZExtOrTrunc(V, dl, VT); + } + } + + // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && @@ -41775,7 +41793,8 @@ } static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = N->getSimpleValueType(0); @@ -41796,7 +41815,7 @@ // Look through int->fp bitcasts that don't change the element width. 
unsigned EltWidth = SrcVT.getScalarSizeInBits(); - if (Src.getOpcode() == ISD::BITCAST && + if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST && Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); @@ -43759,7 +43778,7 @@ case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); - case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI); + case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget); case X86ISD::MGATHER: case X86ISD::MSCATTER: case ISD::MGATHER: Index: llvm/trunk/test/CodeGen/X86/pr42870.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/pr42870.ll +++ llvm/trunk/test/CodeGen/X86/pr42870.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=sse | FileCheck %s + +define i32 @foo(<4 x float>* %a) { +; CHECK-LABEL: foo: +; CHECK: ## %bb.0: ## %start +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: movmskps %xmm0, %eax +; CHECK-NEXT: retl +start: + %0 = bitcast <4 x float>* %a to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 16 + %2 = icmp slt <4 x i32> %1, zeroinitializer + %3 = bitcast <4 x i1> %2 to i4 + %4 = zext i4 %3 to i32 + ret i32 %4 +} + +define i32 @bar(<4 x float> %a) { +; CHECK-LABEL: bar: +; CHECK: ## %bb.0: ## %start +; CHECK-NEXT: movmskps %xmm0, %eax +; CHECK-NEXT: retl +start: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = icmp slt <4 x i32> %0, zeroinitializer + %2 = bitcast <4 x i1> %1 to i4 + %3 = zext i4 %2 to i32 + ret i32 %3 +}