diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4759,20 +4759,26 @@ unsigned LoReg, HiReg; bool IsSigned = Opcode == ISD::SMUL_LOHI; bool UseMULX = !IsSigned && Subtarget->hasBMI2(); + // Pick the high-only MULX pseudo when the low half (result 0) is unused. + bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i32: - Opc = UseMULX ? X86::MULX32rr : + Opc = UseMULXHi ? X86::MULX32Hrr : + UseMULX ? X86::MULX32rr : IsSigned ? X86::IMUL32r : X86::MUL32r; - MOpc = UseMULX ? X86::MULX32rm : + MOpc = UseMULXHi ? X86::MULX32Hrm : + UseMULX ? X86::MULX32rm : IsSigned ? X86::IMUL32m : X86::MUL32m; LoReg = UseMULX ? X86::EDX : X86::EAX; HiReg = X86::EDX; break; case MVT::i64: - Opc = UseMULX ? X86::MULX64rr : + Opc = UseMULXHi ? X86::MULX64Hrr : + UseMULX ? X86::MULX64rr : IsSigned ? X86::IMUL64r : X86::MUL64r; - MOpc = UseMULX ? X86::MULX64rm : + MOpc = UseMULXHi ? X86::MULX64Hrm : + UseMULX ? X86::MULX64rm : IsSigned ? X86::IMUL64m : X86::MUL64m; LoReg = UseMULX ? 
X86::RDX : X86::RAX; HiReg = X86::RDX; @@ -4796,7 +4802,12 @@ MachineSDNode *CNode = nullptr; SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; - if (UseMULX) { + if (UseMULXHi) { + SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); + CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + Chain = SDValue(CNode, 1); + } else if (UseMULX) { SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other); CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); @@ -4815,7 +4826,11 @@ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); } else { SDValue Ops[] = { N1, InFlag }; - if (UseMULX) { + if (UseMULXHi) { + SDVTList VTs = CurDAG->getVTList(NVT); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + } else if (UseMULX) { SDVTList VTs = CurDAG->getVTList(NVT, NVT); SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1313,7 +1313,17 @@ let mayLoad = 1 in def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>; + + // Pseudo instructions to be used when the low result isn't used. The + // instruction is defined to keep the high if both destinations are the same. 
+ def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src), + []>, Sched<[sched]>; + + let mayLoad = 1 in + def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src), + []>, Sched<[sched.Folded]>; } } diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -2193,6 +2193,37 @@ return; } + case X86::MULX32Hrr: + case X86::MULX32Hrm: + case X86::MULX64Hrr: + case X86::MULX64Hrm: { + // Turn into regular MULX by duplicating the destination. + unsigned NewOpc; + // Keep the operand form: rr pseudos map to rr, rm pseudos map to rm. + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::MULX32Hrr: NewOpc = X86::MULX32rr; break; + case X86::MULX32Hrm: NewOpc = X86::MULX32rm; break; + case X86::MULX64Hrr: NewOpc = X86::MULX64rr; break; + case X86::MULX64Hrm: NewOpc = X86::MULX64rm; break; + } + + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + + // Add the destination twice. + unsigned DestReg = MI->getOperand(0).getReg(); + TmpInst.addOperand(MCOperand::createReg(DestReg)); + TmpInst.addOperand(MCOperand::createReg(DestReg)); + + for (unsigned I = 1; I < MI->getNumOperands(); ++I) + if (auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(I))) + TmpInst.addOperand(*Op); + + EmitAndCountInstruction(TmpInst); + return; + } + // Lower PSHUFB and VPERMILP normally but add a comment if we can find // a constant shuffle mask. We won't be able to do this at the MC layer // because the mask isn't an immediate. 
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -839,14 +839,14 @@ ; CHECK-O3-CUR: # %bb.0: ; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx ; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 -; CHECK-O3-CUR-NEXT: mulxq %rax, %rcx, %rax +; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rax ; CHECK-O3-CUR-NEXT: shrq $3, %rax ; CHECK-O3-CUR-NEXT: retq ; ; CHECK-O3-EX-LABEL: load_fold_udiv1: ; CHECK-O3-EX: # %bb.0: ; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 -; CHECK-O3-EX-NEXT: mulxq (%rdi), %rcx, %rax +; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rax ; CHECK-O3-EX-NEXT: shrq $3, %rax ; CHECK-O3-EX-NEXT: retq %v = load atomic i64, i64* %p unordered, align 8 @@ -1034,9 +1034,9 @@ ; CHECK-O3-NEXT: movq (%rdi), %rax ; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 ; CHECK-O3-NEXT: movq %rax, %rdx -; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rdx -; CHECK-O3-NEXT: shrq $3, %rdx -; CHECK-O3-NEXT: leaq (%rdx,%rdx,4), %rcx +; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rcx +; CHECK-O3-NEXT: shrq $3, %rcx +; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rcx ; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx ; CHECK-O3-NEXT: subq %rcx, %rax ; CHECK-O3-NEXT: retq @@ -1693,7 +1693,7 @@ ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movq (%rdi), %rdx ; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 -; CHECK-O0-NEXT: mulxq %rax, %rcx, %rax +; CHECK-O0-NEXT: mulxq %rax, %rax, %rax ; CHECK-O0-NEXT: shrq $3, %rax ; CHECK-O0-NEXT: movq %rax, (%rdi) ; CHECK-O0-NEXT: retq @@ -1702,17 +1702,17 @@ ; CHECK-O3-CUR: # %bb.0: ; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx ; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 -; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rcx -; CHECK-O3-CUR-NEXT: shrq $3, %rcx -; CHECK-O3-CUR-NEXT: movq %rcx, (%rdi) +; 
CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rax +; CHECK-O3-CUR-NEXT: shrq $3, %rax +; CHECK-O3-CUR-NEXT: movq %rax, (%rdi) ; CHECK-O3-CUR-NEXT: retq ; ; CHECK-O3-EX-LABEL: rmw_fold_udiv1: ; CHECK-O3-EX: # %bb.0: ; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 -; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rcx -; CHECK-O3-EX-NEXT: shrq $3, %rcx -; CHECK-O3-EX-NEXT: movq %rcx, (%rdi) +; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rax +; CHECK-O3-EX-NEXT: shrq $3, %rax +; CHECK-O3-EX-NEXT: movq %rax, (%rdi) ; CHECK-O3-EX-NEXT: retq %prev = load atomic i64, i64* %p unordered, align 8 %val = udiv i64 %prev, 15 @@ -1840,7 +1840,7 @@ ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 ; CHECK-O0-NEXT: movq %rax, %rdx -; CHECK-O0-NEXT: mulxq %rcx, %rdx, %rcx +; CHECK-O0-NEXT: mulxq %rcx, %rcx, %rcx ; CHECK-O0-NEXT: shrq $3, %rcx ; CHECK-O0-NEXT: leaq (%rcx,%rcx,4), %rcx ; CHECK-O0-NEXT: leaq (%rcx,%rcx,2), %rcx @@ -1852,9 +1852,9 @@ ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movq (%rdi), %rdx ; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 -; CHECK-O3-NEXT: mulxq %rax, %rax, %rcx -; CHECK-O3-NEXT: shrq $3, %rcx -; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rax +; CHECK-O3-NEXT: mulxq %rax, %rax, %rax +; CHECK-O3-NEXT: shrq $3, %rax +; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax ; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax ; CHECK-O3-NEXT: subq %rax, %rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -54,7 +54,7 @@ ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: mulxl %esi, %edx, %ebx +; X86-BMI-NEXT: mulxl %esi, %ebx, %ebx ; X86-BMI-NEXT: movl %ecx, %edx ; X86-BMI-NEXT: mulxl %esi, %esi, %ebp ; X86-BMI-NEXT: addl %ebx, %esi @@ -85,7 +85,7 
@@ ; X64-BMI-LABEL: foo: ; X64-BMI: # %bb.0: ; X64-BMI-NEXT: movq %rdi, %rdx -; X64-BMI-NEXT: mulxq %rsi, %rcx, %rax +; X64-BMI-NEXT: mulxq %rsi, %rax, %rax ; X64-BMI-NEXT: retq %tmp0 = zext i64 %x to i128 %tmp1 = zext i64 %y to i128 diff --git a/llvm/test/CodeGen/X86/pr35636.ll b/llvm/test/CodeGen/X86/pr35636.ll --- a/llvm/test/CodeGen/X86/pr35636.ll +++ b/llvm/test/CodeGen/X86/pr35636.ll @@ -7,9 +7,9 @@ ; HSW: # %bb.0: # %bb ; HSW-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81 ; HSW-NEXT: movq %rdi, %rdx -; HSW-NEXT: mulxq %rax, %rax, %rcx -; HSW-NEXT: shrq $42, %rcx -; HSW-NEXT: imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1 +; HSW-NEXT: mulxq %rax, %rax, %rax +; HSW-NEXT: shrq $42, %rax +; HSW-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1 ; HSW-NEXT: shrq $20, %rax ; HSW-NEXT: leal (%rax,%rax,4), %eax ; HSW-NEXT: addl $5, %eax @@ -24,9 +24,9 @@ ; ZN: # %bb.0: # %bb ; ZN-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81 ; ZN-NEXT: movq %rdi, %rdx -; ZN-NEXT: mulxq %rax, %rax, %rcx -; ZN-NEXT: shrq $42, %rcx -; ZN-NEXT: imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1 +; ZN-NEXT: mulxq %rax, %rax, %rax +; ZN-NEXT: shrq $42, %rax +; ZN-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1 ; ZN-NEXT: shrq $20, %rax ; ZN-NEXT: leal 5(%rax,%rax,4), %eax ; ZN-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF