diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5638,6 +5638,43 @@
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
 
+    EVT OpVT = N0.getValueType();
+    // This is a fixup if we converted (cmp Op8/Op16, i8) -> (cmp Op32, i32)
+    // during lowering.
+    if (OpVT.isScalarInteger() && OpVT.getScalarSizeInBits() > 8 &&
+        isa<ConstantSDNode>(N1) && !isNullConstant(N1) &&
+        hasNoSignFlagUses(SDValue(Node, 0))) {
+      const APInt &C = cast<ConstantSDNode>(N1)->getAPIntValue();
+      // Only do replacement if the constant can get imm8 encoding. Imm16 values
+      // cause LCP stalls in the frontend.
+      // TODO: Enable imm16 transform as well if -Os is set?
+      if (C.getSignificantBits() <= 8) {
+        MVT NewVT;
+        if (CurDAG->MaskedValueIsZero(
+                N0, APInt::getBitsSetFrom(OpVT.getScalarSizeInBits(), 8)))
+          NewVT = MVT::i8;
+        else if (CurDAG->MaskedValueIsZero(
+                     N0, APInt::getBitsSetFrom(OpVT.getScalarSizeInBits(), 16)))
+          NewVT = MVT::i16;
+        else
+          break;
+
+        SDValue TruncN0 = CurDAG->getZExtOrTrunc(N0, dl, NewVT);
+        insertDAGNode(*CurDAG, SDValue(Node, 0), TruncN0);
+        SDValue TruncN1 = CurDAG->getConstant(
+            C.truncSSat(NewVT.getScalarSizeInBits()).getZExtValue(), dl,
+            NewVT);
+        insertDAGNode(*CurDAG, SDValue(Node, 0), TruncN1);
+        SDValue NewCmp =
+            CurDAG->getNode(X86ISD::CMP, dl, MVT::i32, TruncN0, TruncN1);
+        ReplaceNode(Node, NewCmp.getNode());
+        if (N1.getNode()->use_empty())
+          CurDAG->RemoveDeadNode(N1.getNode());
+        SelectCode(NewCmp.getNode());
+        return;
+      }
+    }
+
     // Optimizations for TEST compares.
     if (!isNullConstant(N1))
       break;
diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll
--- a/llvm/test/CodeGen/X86/combine-movmsk.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk.ll
@@ -41,7 +41,7 @@
 ; SSE-NEXT:    xorpd %xmm1, %xmm1
 ; SSE-NEXT:    cmpeqpd %xmm0, %xmm1
 ; SSE-NEXT:    movmskpd %xmm1, %eax
-; SSE-NEXT:    cmpl $3, %eax
+; SSE-NEXT:    cmpb $3, %al
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
 ;
@@ -95,14 +95,14 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    movmskps %xmm0, %eax
-; SSE2-NEXT:    cmpl $15, %eax
+; SSE2-NEXT:    cmpb $15, %al
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: pmovmskb_allof_bitcast_v2i64:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movmskpd %xmm0, %eax
-; SSE42-NEXT:    cmpl $3, %eax
+; SSE42-NEXT:    cmpb $3, %al
 ; SSE42-NEXT:    sete %al
 ; SSE42-NEXT:    retq
 ;
@@ -151,7 +151,7 @@
 ; SSE-NEXT:    xorps %xmm1, %xmm1
 ; SSE-NEXT:    cmpeqps %xmm0, %xmm1
 ; SSE-NEXT:    movmskps %xmm1, %eax
-; SSE-NEXT:    cmpl $15, %eax
+; SSE-NEXT:    cmpb $15, %al
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
@@ -681,14 +681,14 @@
 ; SSE-LABEL: test_v2i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    cmpw $-1, %ax
+; SSE-NEXT:    cmpw $255, %ax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2i8:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    cmpw $-1, %ax
+; AVX-NEXT:    cmpw $255, %ax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
   %1 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %a0)
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll b/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll
@@ -697,13 +697,13 @@
 define i1 @test_v2i8(ptr %ptr) nounwind {
 ; SSE-LABEL: test_v2i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cmpw $-1, (%rdi)
+; SSE-NEXT:    cmpw $255, (%rdi)
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    cmpw $-1, (%rdi)
+; AVX-NEXT:    cmpw $255, (%rdi)
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
   %vload = load <2 x i8>, ptr %ptr