diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -57,6 +57,7 @@
 private:
   const X86InstrInfo *TII = nullptr;
   const X86Subtarget *ST = nullptr;
+  const MCSchedModel *SM = nullptr;
 };
 } // end anonymous namespace
@@ -68,6 +69,14 @@
   return new X86FixupInstTuningPass();
 }
 
+template <typename T>
+static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
+  if (NewVal != std::nullopt && CurVal != std::nullopt && *NewVal != *CurVal)
+    return *NewVal < *CurVal;
+
+  return std::nullopt;
+}
+
 bool X86FixupInstTuningPass::processInstruction(
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator &I) {
@@ -75,10 +84,59 @@
   unsigned Opc = MI.getOpcode();
   unsigned NumOperands = MI.getDesc().getNumOperands();
 
+  auto GetInstSchedInfo = [&](unsigned Opcode,
+                              bool Lat) -> std::optional<double> {
+    if (SM->hasInstrSchedModel()) {
+      const MCSchedClassDesc *SchedClassDesc =
+          SM->getSchedClassDesc(TII->get(Opcode).getSchedClass());
+      return Lat ? MCSchedModel::computeInstrLatency(*ST, *SchedClassDesc)
+                 : MCSchedModel::getReciprocalThroughput(*ST, *SchedClassDesc);
+    }
+    return std::nullopt;
+  };
+
+  auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
+    return GetInstSchedInfo(Opcode, /*Lat*/ false);
+  };
+
+  auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
+    return GetInstSchedInfo(Opcode, /*Lat*/ true);
+  };
+
+  auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
+    unsigned Size = TII->get(Opcode).getSize();
+    if (Size)
+      return Size;
+    // Zero size means we were unable to compute it.
+    return std::nullopt;
+  };
+
+  auto NewOpcPreferable = [&](unsigned NewOpc,
+                              bool ReplaceInTie = true) -> bool {
+    // Compare tput -> lat -> code size.
+    auto Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
+    if (Res != std::nullopt)
+      return *Res;
+
+    Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
+    if (Res != std::nullopt)
+      return *Res;
+
+    Res = CmpOptionals(GetInstSize(NewOpc), GetInstSize(Opc));
+    if (Res != std::nullopt)
+      return *Res;
+
+    // We either were unable to get tput/lat/codesize or all the values were
+    // equal. Return the specified option for a tie.
+    return ReplaceInTie;
+  };
+
   // `vpermilps r, i` -> `vshufps r, r, i`
-  // `vshufps` is always as fast or faster than `vpermilps` and takes 1 less
-  // byte of code size.
+  // `vshufps` is always as fast or faster than
+  // `vpermilps` and takes 1 less byte of code size.
   auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
+    if (!NewOpcPreferable(NewOpc))
+      return false;
     unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
     MI.removeOperand(NumOperands - 1);
     MI.addOperand(MI.getOperand(1));
@@ -93,12 +151,34 @@
   auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
     // TODO: Might be worth adding bypass delay if -Os/-Oz is enabled as
     // `vpshufd` saves a byte of code size.
-    if (!ST->hasNoDomainDelayShuffle())
+    if (!ST->hasNoDomainDelayShuffle() &&
+        !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
       return false;
     MI.setDesc(TII->get(NewOpc));
     return true;
   };
 
+  // `vunpcklpd/vmovlhps r, r` -> `vshufps r, r, 0x44`
+  // `vunpckhpd r, r` -> `vshufps r, r, 0xee`
+  // iff `vshufps` is faster than `vunpck{l|h}pd`. Otherwise stick with
+  // `vunpck{l|h}pd` as it uses less code size.
+  // TODO: Look into using `{VP}UNPCK{L|H}QDQ{...}` instead of `{V}SHUF{...}PS`
+  // as the replacement. `{VP}UNPCK{L|H}QDQ{...}` has no codesize cost.
+  auto ProcessUNPCKPD = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
+    if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
+      return false;
+
+    MI.setDesc(TII->get(NewOpc));
+    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    return true;
+  };
+
+  auto ProcessUNPCKLPDrr = [&](unsigned NewOpc) -> bool {
+    return ProcessUNPCKPD(NewOpc, 0x44);
+  };
+  auto ProcessUNPCKHPDrr = [&](unsigned NewOpc) -> bool {
+    return ProcessUNPCKPD(NewOpc, 0xee);
+  };
   // TODO: Add masked predicate execution variants.
   switch (Opc) {
   case X86::VPERMILPSri:
@@ -121,6 +201,41 @@
     return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
   case X86::VPERMILPSZmi:
     return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
+
+    // TODO: {V}UNPCK{L|H}PD{...} is probably safe to transform to
+    // `{VP}UNPCK{L|H}QDQ{...}` which gets the same perf benefit as
+    // `{V}SHUF{...}PS` but 1) without increasing code size and 2) can also
+    // handle the `mr` case. ICL doesn't have a domain penalty for replacing
+    // float unpck -> int unpck, but at this time, I haven't verified the set of
+    // processors where it's safe.
+  case X86::MOVLHPSrr:
+  case X86::UNPCKLPDrr:
+    return ProcessUNPCKLPDrr(X86::SHUFPSrri);
+  case X86::VMOVLHPSrr:
+  case X86::VUNPCKLPDrr:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSrri);
+  case X86::VUNPCKLPDYrr:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSYrri);
+    // VMOVLHPS is always 128 bits.
+  case X86::VMOVLHPSZrr:
+  case X86::VUNPCKLPDZ128rr:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSZ128rri);
+  case X86::VUNPCKLPDZ256rr:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSZ256rri);
+  case X86::VUNPCKLPDZrr:
+    return ProcessUNPCKLPDrr(X86::VSHUFPSZrri);
+  case X86::UNPCKHPDrr:
+    return ProcessUNPCKHPDrr(X86::SHUFPSrri);
+  case X86::VUNPCKHPDrr:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSrri);
+  case X86::VUNPCKHPDYrr:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSYrri);
+  case X86::VUNPCKHPDZ128rr:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSZ128rri);
+  case X86::VUNPCKHPDZ256rr:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSZ256rri);
+  case X86::VUNPCKHPDZrr:
+    return ProcessUNPCKHPDrr(X86::VSHUFPSZrri);
   default:
     return false;
   }
@@ -131,6 +246,8 @@
   bool Changed = false;
   ST = &MF.getSubtarget<X86Subtarget>();
   TII = ST->getInstrInfo();
+  SM = &ST->getSchedModel();
+
   for (MachineBasicBlock &MBB : MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
       if (processInstruction(MF, MBB, I)) {
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
@@ -36,42 +36,119 @@
 }
 
 define <8 x float> @transform_VUNPCKLPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-SKL-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-SKL: # %bb.0:
+; CHECK-SKL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-SKL-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1],ymm0[4,5],ymm1[4,5]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ZNVER4-NEXT: retq
   %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
   ret <8 x float> %shufp
 }
 
 define <8 x float> @transform_VUNPCKHPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-SKL-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-SKL: # %bb.0:
+; CHECK-SKL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-SKL-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3],ymm0[6,7],ymm1[6,7]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ZNVER4-NEXT: retq
   %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
   ret <8 x float> %shufp
 }
 
 define <4 x float> @transform_VUNPCKLPDrr(<4 x float> %a, <4 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrr:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-SKL-LABEL: transform_VUNPCKLPDrr:
+; CHECK-SKL: # %bb.0:
+; CHECK-SKL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SKL-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrr:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ZNVER4-NEXT: retq
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shufp
 }
 
 define <4 x float> @transform_VUNPCKHPDrr(<4 x float> %a, <4 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrr:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-SKL-LABEL: transform_VUNPCKHPDrr:
+; CHECK-SKL: # %bb.0:
+; CHECK-SKL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-SKL-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,3]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrr:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ZNVER4-NEXT: retq
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x float> %shufp
 }
 
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-ICX: {{.*}}
-; CHECK-SKX: {{.*}}
-; CHECK-V4: {{.*}}
-; CHECK-ZNVER4: {{.*}}
+; CHECK: {{.*}}
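
For readers who want to see the selection policy in isolation: below is a minimal standalone sketch of the CmpOptionals/NewOpcPreferable logic from the patch. The SchedInfo struct and the throughput/latency/size numbers are hypothetical stand-ins for the MCSchedModel queries (on X86, MCInstrDesc::getSize() typically returns 0, so the size criterion usually defers to ReplaceInTie); only CmpOptionals and the tput -> lat -> size comparison order are taken from the patch itself.

#include <iostream>
#include <optional>

// Same helper as in the patch: yields a decision only when both values are
// known and differ; otherwise the caller falls through to the next criterion.
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  if (NewVal != std::nullopt && CurVal != std::nullopt && *NewVal != *CurVal)
    return *NewVal < *CurVal;
  return std::nullopt;
}

// Hypothetical stand-in for the MCSchedModel/MCInstrDesc queries.
struct SchedInfo {
  std::optional<double> Tput;   // reciprocal throughput, lower is better
  std::optional<double> Lat;    // latency, lower is better
  std::optional<unsigned> Size; // encoded length in bytes, lower is better
};

static bool NewOpcPreferable(const SchedInfo &New, const SchedInfo &Cur,
                             bool ReplaceInTie) {
  // Same priority order as the pass: tput -> lat -> code size.
  if (auto Res = CmpOptionals(New.Tput, Cur.Tput))
    return *Res;
  if (auto Res = CmpOptionals(New.Lat, Cur.Lat))
    return *Res;
  if (auto Res = CmpOptionals(New.Size, Cur.Size))
    return *Res;
  return ReplaceInTie; // nothing distinguishes the two opcodes
}

int main() {
  SchedInfo Unpckpd{1.0, 1.0, 4u};
  // ICX-like case: the replacement has better throughput, so it wins outright.
  SchedInfo ShufpsFast{0.5, 1.0, 5u};
  std::cout << NewOpcPreferable(ShufpsFast, Unpckpd, false) << '\n'; // 1
  // SKX-like case: tput and lat tie, and the larger encoding loses the
  // size tie-break, so the original vunpckpd is kept.
  SchedInfo ShufpsSame{1.0, 1.0, 5u};
  std::cout << NewOpcPreferable(ShufpsSame, Unpckpd, false) << '\n'; // 0
}

This mirrors why the test above keeps vunpcklpd/vunpckhpd on SKX/SKL/V4/ZNVER4 but switches to vshufps on ICX: only there does the scheduler model report a throughput win for the replacement.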