Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15499,16 +15499,38 @@ // converts. } - if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST) { + // TODO: These transforms should not require the 'hasOneUse' restriction, but + // there are regressions on multiple targets without it. We can end up with a + // mess of scalar and vector code if we reduce only part of the DAG to scalar. + if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && VT.isInteger() && + InVec.hasOneUse()) { // The vector index of the LSBs of the source depend on the endian-ness. bool IsLE = DAG.getDataLayout().isLittleEndian(); - + unsigned ExtractIndex = ConstEltNo->getZExtValue(); // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x) unsigned BCTruncElt = IsLE ? 0 : VT.getVectorNumElements() - 1; SDValue BCSrc = InVec.getOperand(0); - if (InVec.hasOneUse() && ConstEltNo->getZExtValue() == BCTruncElt && - VT.isInteger() && BCSrc.getValueType().isScalarInteger()) + if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger()) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc); + + if (BCSrc.getValueType().isInteger() && + BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) { + // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt --> + // trunc i64 X to i32 + SDValue X = BCSrc.getOperand(0); + assert(X.getValueType().isScalarInteger() && NVT.isScalarInteger() && + "Extract element and scalar to vector can't change element type " + "from FP to integer."); + unsigned XBitWidth = X.getValueSizeInBits(); + unsigned VecEltBitWidth = VT.getScalarSizeInBits(); + BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1; + + // An extract element return value type can be wider than its vector + // operand element type. In that case, the high bits are undefined, so + // it's possible that we may need to extend rather than truncate. + if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) + return DAG.getAnyExtOrTrunc(X, SDLoc(N), NVT); + } } // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val Index: test/CodeGen/AArch64/extract-insert.ll =================================================================== --- test/CodeGen/AArch64/extract-insert.ll +++ test/CodeGen/AArch64/extract-insert.ll @@ -12,8 +12,7 @@ ; ; LE-LABEL: trunc_i64_to_i32_le: ; LE: // %bb.0: -; LE-NEXT: fmov d0, x0 -; LE-NEXT: fmov w0, s0 +; LE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; LE-NEXT: ret %ins = insertelement <2 x i64> undef, i64 %x, i32 0 %bc = bitcast <2 x i64> %ins to <4 x i32> @@ -24,9 +23,7 @@ define i32 @trunc_i64_to_i32_be(i64 %x) { ; BE-LABEL: trunc_i64_to_i32_be: ; BE: // %bb.0: -; BE-NEXT: fmov d0, x0 -; BE-NEXT: rev64 v0.4s, v0.4s -; BE-NEXT: mov w0, v0.s[1] +; BE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; BE-NEXT: ret ; ; LE-LABEL: trunc_i64_to_i32_be: @@ -50,8 +47,7 @@ ; ; LE-LABEL: trunc_i64_to_i16_le: ; LE: // %bb.0: -; LE-NEXT: fmov d0, x0 -; LE-NEXT: umov w0, v0.h[0] +; LE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; LE-NEXT: ret %ins = insertelement <2 x i64> undef, i64 %x, i32 0 %bc = bitcast <2 x i64> %ins to <8 x i16> @@ -62,9 +58,7 @@ define i16 @trunc_i64_to_i16_be(i64 %x) { ; BE-LABEL: trunc_i64_to_i16_be: ; BE: // %bb.0: -; BE-NEXT: fmov d0, x0 -; BE-NEXT: rev64 v0.8h, v0.8h -; BE-NEXT: umov w0, v0.h[3] +; BE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; BE-NEXT: ret ; ; LE-LABEL: trunc_i64_to_i16_be: @@ -88,8 +82,6 @@ ; ; LE-LABEL: trunc_i32_to_i8_le: ; LE: // %bb.0: -; LE-NEXT: fmov s0, w0 -; LE-NEXT: umov w0, v0.b[0] ; LE-NEXT: ret %ins = insertelement <4 x i32> undef, i32 %x, i32 0 %bc = bitcast <4 x i32> %ins to <16 x i8> @@ -100,9 +92,6 @@ define i8 @trunc_i32_to_i8_be(i32 %x) { ; BE-LABEL: trunc_i32_to_i8_be: ; BE: // %bb.0: -; BE-NEXT: fmov s0, w0 -; BE-NEXT: rev32 v0.16b, v0.16b -; BE-NEXT: umov w0, v0.b[3] ; BE-NEXT: ret ; ; LE-LABEL: trunc_i32_to_i8_be: Index: test/CodeGen/X86/extract-insert.ll =================================================================== --- test/CodeGen/X86/extract-insert.ll +++ test/CodeGen/X86/extract-insert.ll @@ -68,8 +68,8 @@ ; ; X64-LABEL: trunc_i64_to_i32_le: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %xmm0 -; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq %ins = insertelement <2 x i64> undef, i64 %x, i32 0 %bc = bitcast <2 x i64> %ins to <4 x i32> @@ -86,9 +86,8 @@ ; ; X64-LABEL: trunc_i64_to_i16_le: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %xmm0 -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %ins = insertelement <2 x i64> undef, i64 %x, i32 0 %bc = bitcast <2 x i64> %ins to <8 x i16> Index: test/CodeGen/X86/mmx-coalescing.ll =================================================================== --- test/CodeGen/X86/mmx-coalescing.ll +++ test/CodeGen/X86/mmx-coalescing.ll @@ -16,16 +16,17 @@ ; CHECK-NEXT: # %bb.2: # %if.B ; CHECK-NEXT: pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3] ; CHECK-NEXT: movq %mm0, %rax -; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: jne .LBB0_4 ; CHECK-NEXT: .LBB0_1: # %if.A +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movd %edx, %mm1 ; CHECK-NEXT: psllq %mm1, %mm0 ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: jne .LBB0_4 -; CHECK-NEXT: .LBB0_3: # %if.C -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: # %bb.3: # %if.C +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: .LBB0_4: # %merge