diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17640,6 +17640,53 @@
   return SDValue();
 }
 
+/// Transform a vector operation on a splatted vector into a scalar operation on
+/// the splat value.
+static SDValue scalarizeSplatValue(SDValue Vec, EVT ResVT, SelectionDAG &DAG) {
+  // Don't scalarize if we need the full vector anyway.
+  // TODO: Handle the case where there are multiple extract_elt users
+  if (!Vec.hasOneUse() || Vec.getNode()->getNumValues() != 1)
+    return SDValue();
+
+  SDLoc DL(Vec);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT ScalarVT = Vec.getValueType().getScalarType();
+  unsigned Opc = Vec.getOpcode();
+
+  // Don't scalarize if the target does not support the scalar operation
+  if (!TLI.isOperationLegalOrCustomOrPromote(Opc, ResVT))
+    return SDValue();
+
+  // TODO: Check for all the structures recognized by SelectionDAG::isSplatValue
+  switch (Opc) {
+  default:
+    break;
+  case ISD::SPLAT_VECTOR: {
+    SDValue SplatVal = Vec.getOperand(0);
+    return ResVT == ScalarVT ? SplatVal
+                             : DAG.getAnyExtOrTrunc(SplatVal, DL, ResVT);
+  }
+  case ISD::BUILD_VECTOR: {
+    int SplatIdx;
+    if (!DAG.getSplatSourceVector(Vec, SplatIdx))
+      break;
+    SDValue SplatVal = Vec.getOperand(SplatIdx);
+    return ResVT == ScalarVT ? SplatVal
+                             : DAG.getAnyExtOrTrunc(SplatVal, DL, ResVT);
+  }
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::AND: {
+    SDValue LHS, RHS;
+    if ((LHS = scalarizeSplatValue(Vec.getOperand(0), ResVT, DAG)) &&
+        (RHS = scalarizeSplatValue(Vec.getOperand(1), ResVT, DAG)))
+      return DAG.getNode(Opc, DL, ResVT, LHS, RHS);
+    break;
+  }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   SDValue VecOp = N->getOperand(0);
   SDValue Index = N->getOperand(1);
@@ -17751,6 +17798,9 @@
   if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
     return BO;
 
+  if (SDValue SO = scalarizeSplatValue(VecOp, ScalarVT, DAG))
+    return SO;
+
   // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
   // We only perform this optimization before the op legalization phase because
   // we may introduce new vector instructions which are not backed by TD patterns.
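Illustrative sketch only, not part of the patch: a minimal IR example of the pattern the new combine targets, modeled on the WebAssembly test updated below. The function and value names are invented for the example; whether the fold actually fires also depends on the one-use and isOperationLegalOrCustomOrPromote checks above.

; With scalarizeSplatValue, the extractelement of the splatted add can be
; folded to a scalar "add i8 %a, %b" instead of keeping the <16 x i8> add.
define i8 @extract_from_splat_add(i8 %a, i8 %b) {
  %ins.a = insertelement <16 x i8> undef, i8 %a, i32 0
  %splat.a = shufflevector <16 x i8> %ins.a, <16 x i8> undef, <16 x i32> zeroinitializer
  %ins.b = insertelement <16 x i8> undef, i8 %b, i32 0
  %splat.b = shufflevector <16 x i8> %ins.b, <16 x i8> undef, <16 x i32> zeroinitializer
  %sum = add <16 x i8> %splat.a, %splat.b
  %lane = extractelement <16 x i8> %sum, i32 0
  ret i8 %lane
}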
diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
--- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -24,17 +24,8 @@
 define void @test2(float * %p1, i32 %v1) {
 ; CHECK-LABEL: test2:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    sub sp, sp, #16 ; =16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    movi.16b v0, #63
-; CHECK-NEXT:    and x8, x1, #0x3
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    bfi x9, x8, #2, #2
-; CHECK-NEXT:    ldr s0, [x9]
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    add sp, sp, #16 ; =16
+; CHECK-NEXT:    mov w8, #1061109567
+; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
 entry:
   %v2 = extractelement <3 x float> , i32 %v1
diff --git a/llvm/test/CodeGen/SystemZ/vec-trunc-to-i1.ll b/llvm/test/CodeGen/SystemZ/vec-trunc-to-i1.ll
--- a/llvm/test/CodeGen/SystemZ/vec-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-trunc-to-i1.ll
@@ -7,13 +7,10 @@
 ; CHECK-LABEL: pr32275:
 ; CHECK:       # %bb.0: # %BB
 ; CHECK-NEXT:    vlgvb %r0, %v24, 3
-; CHECK-NEXT:    vlvgp %v0, %r0, %r0
-; CHECK-NEXT:    vrepif %v1, 1
-; CHECK-NEXT:    vn %v0, %v0, %v1
-; CHECK-NEXT:    vlgvf %r0, %v0, 3
 ; CHECK-NEXT:  .LBB0_1: # %CF34
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    cijlh %r0, 0, .LBB0_1
+; CHECK-NEXT:    tmll %r0, 1
+; CHECK-NEXT:    jne .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %CF36
 ; CHECK-NEXT:    br %r14
 BB:
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
@@ -10,12 +10,9 @@
 
 ; CHECK-LABEL: shl_add:
 ; CHECK-NEXT: .functype shl_add (v128, i32, i32) -> (v128)
-; CHECK-NEXT: i8x16.splat $push1=, $1
-; CHECK-NEXT: i8x16.splat $push0=, $2
-; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0
-; CHECK-NEXT: i8x16.extract_lane_u $push3=, $pop2, 0
-; CHECK-NEXT: i8x16.shl $push4=, $0, $pop3
-; CHECK-NEXT: return $pop4
+; CHECK-NEXT: i32.add $push0=, $1, $2
+; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0
+; CHECK-NEXT: return $pop1
 define <16 x i8> @shl_add(<16 x i8> %v, i8 %a, i8 %b) {
   %t1 = insertelement <16 x i8> undef, i8 %a, i32 0
   %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -785,18 +785,18 @@
 ; KNL-NEXT:    kmovw %edi, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $3, %k1, %k1
-; KNL-NEXT:    korw %k1, %k0, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    movw $-8193, %di ## imm = 0xDFFF
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kandw %k0, %k1, %k1
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k6
 ; KNL-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL-NEXT:    kshiftrw $2, %k6, %k6
-; KNL-NEXT:    korw %k6, %k1, %k6
+; KNL-NEXT:    korw %k6, %k0, %k6
 ; KNL-NEXT:    movw $-16385, %di ## imm = 0xBFFF
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kandw %k1, %k6, %k6
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kandw %k0, %k6, %k6
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; KNL-NEXT:    kmovw %edi, %k7
 ; KNL-NEXT:    kshiftlw $14, %k7, %k7
@@ -885,36 +885,26 @@
 ; KNL-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL-NEXT:    kshiftrw $3, %k3, %k3
 ; KNL-NEXT:    korw %k3, %k2, %k2
-; KNL-NEXT:    kandw %k0, %k2, %k0
+; KNL-NEXT:    kandw %k1, %k2, %k1
 ; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kmovw %ecx, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    kshiftrw $2, %k2, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    xorl %ecx, %ecx
-; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
-; KNL-NEXT:    movl $65535, %edx ## imm = 0xFFFF
-; KNL-NEXT:    movl $0, %esi
-; KNL-NEXT:    cmovnel %edx, %esi
-; KNL-NEXT:    testb $1, {{[0-9]+}}(%rsp)
-; KNL-NEXT:    cmovnel %edx, %ecx
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    korw %k2, %k1, %k1
+; KNL-NEXT:    kandw %k0, %k1, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    kshiftlw $14, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k0
-; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %esi, %k1
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kandw %k1, %k2, %k1
-; KNL-NEXT:    kmovw %k1, %r8d
+; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; KNL-NEXT:    kshiftrw $1, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    kshiftrw $2, %k0, %k1
@@ -936,28 +926,30 @@
 ; KNL-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %ebp
 ; KNL-NEXT:    kshiftrw $11, %k0, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftrw $12, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %edi
 ; KNL-NEXT:    kshiftrw $14, %k0, %k1
-; KNL-NEXT:    andl $1, %r8d
-; KNL-NEXT:    movb %r8b, 2(%rax)
-; KNL-NEXT:    kmovw %k0, %r8d
-; KNL-NEXT:    andl $1, %r8d
+; KNL-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; KNL-NEXT:    movzbl %cl, %ecx
+; KNL-NEXT:    andl $1, %ecx
+; KNL-NEXT:    movb %cl, 2(%rax)
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    andl $1, %ecx
 ; KNL-NEXT:    andl $1, %r9d
-; KNL-NEXT:    leal (%r8,%r9,2), %r8d
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    leal (%rcx,%r9,2), %r9d
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    andl $1, %r10d
-; KNL-NEXT:    leal (%r8,%r10,4), %r8d
+; KNL-NEXT:    leal (%r9,%r10,4), %r9d
 ; KNL-NEXT:    kmovw %k0, %r10d
 ; KNL-NEXT:    andl $1, %r11d
-; KNL-NEXT:    leal (%r8,%r11,8), %r8d
+; KNL-NEXT:    leal (%r9,%r11,8), %r9d
 ; KNL-NEXT:    andl $1, %r12d
 ; KNL-NEXT:    shll $4, %r12d
-; KNL-NEXT:    orl %r8d, %r12d
+; KNL-NEXT:    orl %r9d, %r12d
 ; KNL-NEXT:    andl $1, %r15d
 ; KNL-NEXT:    shll $5, %r15d
 ; KNL-NEXT:    orl %r12d, %r15d
@@ -976,20 +968,20 @@
 ; KNL-NEXT:    shll $10, %ebp
 ; KNL-NEXT:    orl %esi, %ebp
 ; KNL-NEXT:    orl %r15d, %ebp
-; KNL-NEXT:    andl $1, %ecx
-; KNL-NEXT:    shll $11, %ecx
+; KNL-NEXT:    andl $1, %r8d
+; KNL-NEXT:    shll $11, %r8d
 ; KNL-NEXT:    andl $1, %edx
 ; KNL-NEXT:    shll $12, %edx
-; KNL-NEXT:    orl %ecx, %edx
+; KNL-NEXT:    orl %r8d, %edx
 ; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    shll $13, %edi
 ; KNL-NEXT:    orl %edx, %edi
-; KNL-NEXT:    andl $1, %r9d
-; KNL-NEXT:    shll $14, %r9d
-; KNL-NEXT:    orl %edi, %r9d
+; KNL-NEXT:    andl $1, %ecx
+; KNL-NEXT:    shll $14, %ecx
+; KNL-NEXT:    orl %edi, %ecx
 ; KNL-NEXT:    andl $1, %r10d
 ; KNL-NEXT:    shll $15, %r10d
-; KNL-NEXT:    orl %r9d, %r10d
+; KNL-NEXT:    orl %ecx, %r10d
 ; KNL-NEXT:    orl %ebp, %r10d
 ; KNL-NEXT:    movw %r10w, (%rax)
 ; KNL-NEXT:    popq %rbx
@@ -1538,107 +1530,99 @@
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $2, %k2, %k2
 ; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    xorl %eax, %eax
-; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    movl $65535, %ecx ## imm = 0xFFFF
-; KNL_X32-NEXT:    movl $0, %edx
-; KNL_X32-NEXT:    cmovnel %ecx, %edx
 ; KNL_X32-NEXT:    kandw %k0, %k1, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %bl
-; KNL_X32-NEXT:    kmovw %ebx, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
 ; KNL_X32-NEXT:    kshiftlw $14, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
 ; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k0
-; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %bl
-; KNL_X32-NEXT:    kmovw %ebx, %k1
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; KNL_X32-NEXT:    kmovw %eax, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw %edx, %k1
-; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    cmovnel %ecx, %eax
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kandw %k1, %k2, %k1
+; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %esi
-; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edi
+; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
+; KNL_X32-NEXT:    kmovw %k1, %ebx
 ; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %edx
+; KNL_X32-NEXT:    kmovw %k1, %esi
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ecx
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    movb %bl, 2(%eax)
-; KNL_X32-NEXT:    kmovw %k0, %ebx
-; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    leal (%ebx,%esi,2), %esi
-; KNL_X32-NEXT:    kmovw %k1, %ebx
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k1
+; KNL_X32-NEXT:    andb {{[0-9]+}}(%esp), %dl
+; KNL_X32-NEXT:    movzbl %dl, %edx
+; KNL_X32-NEXT:    andl $1, %edx
+; KNL_X32-NEXT:    movb %dl, 2(%eax)
+; KNL_X32-NEXT:    kmovw %k0, %edx
+; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    leal (%esi,%edi,4), %esi
+; KNL_X32-NEXT:    leal (%edx,%edi,2), %edx
 ; KNL_X32-NEXT:    kmovw %k1, %edi
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k1
+; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    leal (%edx,%ebx,4), %edx
+; KNL_X32-NEXT:    kmovw %k1, %ebx
 ; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebp
-; KNL_X32-NEXT:    leal (%esi,%ebp,8), %esi
+; KNL_X32-NEXT:    leal (%edx,%ebp,8), %edx
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
 ; KNL_X32-NEXT:    kshiftrw $9, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %edx
-; KNL_X32-NEXT:    shll $4, %edx
-; KNL_X32-NEXT:    orl %esi, %edx
-; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    andl $1, %esi
+; KNL_X32-NEXT:    shll $4, %esi
+; KNL_X32-NEXT:    orl %edx, %esi
+; KNL_X32-NEXT:    kmovw %k1, %edx
 ; KNL_X32-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ecx
 ; KNL_X32-NEXT:    shll $5, %ecx
-; KNL_X32-NEXT:    orl %edx, %ecx
-; KNL_X32-NEXT:    kmovw %k1, %edx
+; KNL_X32-NEXT:    orl %esi, %ecx
+; KNL_X32-NEXT:    kmovw %k1, %esi
 ; KNL_X32-NEXT:    kshiftrw $11, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    shll $6, %ebx
 ; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    shll $7, %edi
-; KNL_X32-NEXT:    orl %ebx, %edi
-; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    shll $6, %edi
+; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    shll $7, %ebx
+; KNL_X32-NEXT:    orl %edi, %ebx
+; KNL_X32-NEXT:    kmovw %k1, %edi
 ; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebp
 ; KNL_X32-NEXT:    shll $8, %ebp
-; KNL_X32-NEXT:    orl %edi, %ebp
-; KNL_X32-NEXT:    kmovw %k1, %edi
+; KNL_X32-NEXT:    orl %ebx, %ebp
+; KNL_X32-NEXT:    kmovw %k1, %ebx
 ; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    shll $9, %esi
-; KNL_X32-NEXT:    orl %ebp, %esi
+; KNL_X32-NEXT:    andl $1, %edx
+; KNL_X32-NEXT:    shll $9, %edx
+; KNL_X32-NEXT:    orl %ebp, %edx
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
 ; KNL_X32-NEXT:    kshiftrw $14, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %edx
-; KNL_X32-NEXT:    shll $10, %edx
-; KNL_X32-NEXT:    orl %esi, %edx
-; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    andl $1, %esi
+; KNL_X32-NEXT:    shll $10, %esi
+; KNL_X32-NEXT:    orl %edx, %esi
+; KNL_X32-NEXT:    kmovw %k1, %edx
 ; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
-; KNL_X32-NEXT:    orl %ecx, %edx
+; KNL_X32-NEXT:    orl %ecx, %esi
 ; KNL_X32-NEXT:    kmovw %k0, %ecx
-; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    shll $11, %ebx
 ; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    shll $12, %edi
-; KNL_X32-NEXT:    orl %ebx, %edi
+; KNL_X32-NEXT:    shll $11, %edi
+; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    shll $12, %ebx
+; KNL_X32-NEXT:    orl %edi, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebp
 ; KNL_X32-NEXT:    shll $13, %ebp
-; KNL_X32-NEXT:    orl %edi, %ebp
-; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    shll $14, %esi
-; KNL_X32-NEXT:    orl %ebp, %esi
+; KNL_X32-NEXT:    orl %ebx, %ebp
+; KNL_X32-NEXT:    andl $1, %edx
+; KNL_X32-NEXT:    shll $14, %edx
+; KNL_X32-NEXT:    orl %ebp, %edx
 ; KNL_X32-NEXT:    andl $1, %ecx
 ; KNL_X32-NEXT:    shll $15, %ecx
-; KNL_X32-NEXT:    orl %esi, %ecx
 ; KNL_X32-NEXT:    orl %edx, %ecx
+; KNL_X32-NEXT:    orl %esi, %ecx
 ; KNL_X32-NEXT:    movw %cx, (%eax)
 ; KNL_X32-NEXT:    addl $20, %esp
 ; KNL_X32-NEXT:    popl %esi