diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3635,7 +3635,7 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
   if (ForcedVEXEncoding != VEXEncoding_VEX3 &&
-      X86::optimizeInstFromVEX3ToVEX2(Inst))
+      X86::optimizeInstFromVEX3ToVEX2(Inst, MII.get(Inst.getOpcode())))
     return true;
 
   if (X86::optimizeShiftRotateWithImmediateOne(Inst))
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.h b/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.h
--- a/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.h
@@ -14,8 +14,9 @@
 #define LLVM_LIB_TARGET_X86_X86ENCODINGOPTIMIZATION_H
 namespace llvm {
 class MCInst;
+class MCInstrDesc;
 namespace X86 {
-bool optimizeInstFromVEX3ToVEX2(MCInst &MI);
+bool optimizeInstFromVEX3ToVEX2(MCInst &MI, const MCInstrDesc &Desc);
 bool optimizeShiftRotateWithImmediateOne(MCInst &MI);
 } // namespace X86
 } // namespace llvm
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.cpp
--- a/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.cpp
@@ -13,12 +13,19 @@
 #include "X86EncodingOptimization.h"
 #include "X86BaseInfo.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
 
 using namespace llvm;
 
-bool X86::optimizeInstFromVEX3ToVEX2(MCInst &MI) {
+static bool shouldExchange(const MCInst &MI, unsigned OpIdx1, unsigned OpIdx2) {
+  return !X86II::isX86_64ExtendedReg(MI.getOperand(OpIdx1).getReg()) &&
+         X86II::isX86_64ExtendedReg(MI.getOperand(OpIdx2).getReg());
+}
+
+bool X86::optimizeInstFromVEX3ToVEX2(MCInst &MI, const MCInstrDesc &Desc) {
   unsigned OpIdx1, OpIdx2;
   unsigned NewOpc;
+  unsigned Opcode = MI.getOpcode();
 #define FROM_TO(FROM, TO, IDX1, IDX2)                                         \
   case X86::FROM:                                                             \
     NewOpc = X86::TO;                                                         \
@@ -27,8 +34,26 @@
     break;
 #define TO_REV(FROM) FROM_TO(FROM, FROM##_REV, 0, 1)
   switch (MI.getOpcode()) {
-  default:
-    return false;
+  default: {
+    // If the instruction is a commutable arithmetic instruction we might be
+    // able to commute the operands to get a 2 byte VEX prefix.
+    uint64_t TSFlags = Desc.TSFlags;
+    if (!Desc.isCommutable() || (TSFlags & X86II::EncodingMask) != X86II::VEX ||
+        (TSFlags & X86II::OpMapMask) != X86II::TB ||
+        (TSFlags & X86II::FormMask) != X86II::MRMSrcReg ||
+        (TSFlags & X86II::REX_W) || !(TSFlags & X86II::VEX_4V) ||
+        MI.getNumOperands() != 3)
+      return false;
+    // These two are not truly commutable.
+    if (Opcode == X86::VMOVHLPSrr || Opcode == X86::VUNPCKHPDrr)
+      return false;
+    OpIdx1 = 1;
+    OpIdx2 = 2;
+    if (!shouldExchange(MI, OpIdx1, OpIdx2))
+      return false;
+    std::swap(MI.getOperand(OpIdx1), MI.getOperand(OpIdx2));
+    return true;
+  }
   // Commute operands to get a smaller encoding by using VEX.R instead of
   // VEX.B if one of the registers is extended, but other isn't.
   FROM_TO(VMOVZPQILo2PQIrr, VMOVPQI2QIrr, 0, 1)
@@ -51,8 +76,7 @@
 #undef TO_REV
 #undef FROM_TO
   }
-  if (X86II::isX86_64ExtendedReg(MI.getOperand(OpIdx1).getReg()) ||
-      !X86II::isX86_64ExtendedReg(MI.getOperand(OpIdx2).getReg()))
+  if (!shouldExchange(MI, OpIdx1, OpIdx2))
     return false;
   MI.setOpcode(NewOpc);
   return true;
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -502,7 +502,7 @@
     if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
       OutMI.addOperand(*MaybeMCOp);
 
-  if (X86::optimizeInstFromVEX3ToVEX2(OutMI))
+  if (X86::optimizeInstFromVEX3ToVEX2(OutMI, MI->getDesc()))
     return;
 
   // Handle a few special cases to eliminate operand modifiers.
@@ -905,12 +905,6 @@
     }
     break;
   }
-
-  case X86::VMOVHLPSrr:
-  case X86::VUNPCKHPDrr:
-    // These are not truly commutable so hide them from the default case.
-    break;
-
   case X86::MASKMOVDQU:
   case X86::VMASKMOVDQU:
     if (AsmPrinter.getSubtarget().is64Bit())
@@ -918,19 +912,6 @@
     break;
 
   default: {
-    // If the instruction is a commutable arithmetic instruction we might be
-    // able to commute the operands to get a 2 byte VEX prefix.
-    uint64_t TSFlags = MI->getDesc().TSFlags;
-    if (MI->getDesc().isCommutable() &&
-        (TSFlags & X86II::EncodingMask) == X86II::VEX &&
-        (TSFlags & X86II::OpMapMask) == X86II::TB &&
-        (TSFlags & X86II::FormMask) == X86II::MRMSrcReg &&
-        !(TSFlags & X86II::REX_W) && (TSFlags & X86II::VEX_4V) &&
-        OutMI.getNumOperands() == 3) {
-      if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
-          X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg()))
-        std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
-    }
     // Add an REP prefix to BSF instructions so that new processors can
     // recognize as TZCNT, which has better performance than BSF.
     if (X86::isBSF(OutMI.getOpcode()) && !MF.getFunction().hasOptSize()) {
diff --git a/llvm/test/MC/X86/x86_64-avx-encoding.s b/llvm/test/MC/X86/x86_64-avx-encoding.s
--- a/llvm/test/MC/X86/x86_64-avx-encoding.s
+++ b/llvm/test/MC/X86/x86_64-avx-encoding.s
@@ -3168,20 +3168,20 @@
 // CHECK: encoding: [0xc4,0xc1,0x5d,0x5e,0xf4]
           vdivpd  %ymm12, %ymm4, %ymm6
 
-// CHECK: vaddps %ymm12, %ymm4, %ymm6
-// CHECK: encoding: [0xc4,0xc1,0x5c,0x58,0xf4]
+// CHECK: vaddps %ymm4, %ymm12, %ymm6
+// CHECK: encoding: [0xc5,0x9c,0x58,0xf4]
           vaddps  %ymm12, %ymm4, %ymm6
 
-// CHECK: vaddpd %ymm12, %ymm4, %ymm6
-// CHECK: encoding: [0xc4,0xc1,0x5d,0x58,0xf4]
+// CHECK: vaddpd %ymm4, %ymm12, %ymm6
+// CHECK: encoding: [0xc5,0x9d,0x58,0xf4]
          vaddpd  %ymm12, %ymm4, %ymm6
 
-// CHECK: vmulps %ymm12, %ymm4, %ymm6
-// CHECK: encoding: [0xc4,0xc1,0x5c,0x59,0xf4]
+// CHECK: vmulps %ymm4, %ymm12, %ymm6
+// CHECK: encoding: [0xc5,0x9c,0x59,0xf4]
          vmulps  %ymm12, %ymm4, %ymm6
 
-// CHECK: vmulpd %ymm12, %ymm4, %ymm6
-// CHECK: encoding: [0xc4,0xc1,0x5d,0x59,0xf4]
+// CHECK: vmulpd %ymm4, %ymm12, %ymm6
+// CHECK: encoding: [0xc5,0x9d,0x59,0xf4]
          vmulpd  %ymm12, %ymm4, %ymm6
 
 // CHECK: vmaxps (%rax), %ymm4, %ymm6
diff --git a/llvm/test/tools/llvm-mca/X86/show-encoding.s b/llvm/test/tools/llvm-mca/X86/show-encoding.s
--- a/llvm/test/tools/llvm-mca/X86/show-encoding.s
+++ b/llvm/test/tools/llvm-mca/X86/show-encoding.s
@@ -51,7 +51,7 @@
 # NORMAL-NEXT:  1      1     0.50                  vpslldq  $12, %xmm3, %xmm5
 # NORMAL-NEXT:  1      3     1.00                  vaddps   %xmm4, %xmm5, %xmm7
 # NORMAL-NEXT:  1      3     1.00                  vaddps   %xmm6, %xmm7, %xmm8
-# NORMAL-NEXT:  1      3     1.00                  vaddps   %xmm8, %xmm0, %xmm9
+# NORMAL-NEXT:  1      3     1.00                  vaddps   %xmm0, %xmm8, %xmm9
 # NORMAL-NEXT:  1      1     0.50                  vshufps  $255, %xmm9, %xmm9, %xmm0
 # NORMAL-NEXT:  1      1     1.00           *      vmovups  %xmm9, (%r11,%r9,4)
 # NORMAL-NEXT:  1      1     0.50                  cmpl     %r8d, %esi
@@ -70,7 +70,7 @@
 # WITHENCODINGS-NEXT:  1      1     0.50                  5     c5 d1 73 fb 0c        vpslldq  $12, %xmm3, %xmm5
 # WITHENCODINGS-NEXT:  1      3     1.00                  4     c5 d0 58 fc           vaddps   %xmm4, %xmm5, %xmm7
 # WITHENCODINGS-NEXT:  1      3     1.00                  4     c5 40 58 c6           vaddps   %xmm6, %xmm7, %xmm8
-# WITHENCODINGS-NEXT:  1      3     1.00                  5     c4 41 78 58 c8        vaddps   %xmm8, %xmm0, %xmm9
+# WITHENCODINGS-NEXT:  1      3     1.00                  4     c5 38 58 c8           vaddps   %xmm0, %xmm8, %xmm9
 # WITHENCODINGS-NEXT:  1      1     0.50                  6     c4 c1 30 c6 c1 ff     vshufps  $255, %xmm9, %xmm9, %xmm0
 # WITHENCODINGS-NEXT:  1      1     1.00           *      6     c4 01 78 11 0c 8b     vmovups  %xmm9, (%r11,%r9,4)
 # WITHENCODINGS-NEXT:  1      1     0.50                  3     44 39 c6              cmpl     %r8d, %esi
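
Note (not part of the patch): a minimal sketch of why commuting the two sources shrinks the encoding, using the first vaddps case from the x86_64-avx-encoding.s hunk above. The byte values come from the old and new CHECK lines; the field breakdown is added here for illustration and can be reproduced with llvm-mc -triple x86_64 --show-encoding.

# With the extended register (ymm12) in the ModRM.rm slot, the encoder needs
# VEX.B, which only the 3-byte VEX prefix (C4) carries:
#   vaddps %ymm12, %ymm4, %ymm6     # c4 c1 5c 58 f4
# After commuting, ymm12 lands in the 4-bit VEX.vvvv field (which can name any
# of the 16 registers) and the rm slot holds the low register ymm4, so the
# 2-byte prefix (C5) suffices and one byte is saved:
vaddps %ymm4, %ymm12, %ymm6         # c5 9c 58 f4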