Skip to content

Commit 940eae3

Browse files
committed Jan 15, 2018
[X86][SSE] Add custom execution domain fixing for BLENDPD/BLENDPS/PBLENDD/PBLENDW (PR34873)
Add support for custom execution domain fixing and implement support for BLENDPD/BLENDPS/PBLENDD/PBLENDW. Differential Revision: https://reviews.llvm.org/D42042 llvm-svn: 322524
1 parent 30265d0 commit 940eae3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+1013
-1159
lines changed
 

Diff for: ‎llvm/lib/Target/X86/X86InstrInfo.cpp

+188-3
Original file line number | Diff line number | Diff line change
@@ -9694,8 +9694,6 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
96949694
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
96959695
{ X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
96969696
{ X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
9697-
{ X86::VBLENDPSrri, X86::VBLENDPSrri, X86::VPBLENDDrri },
9698-
{ X86::VBLENDPSrmi, X86::VBLENDPSrmi, X86::VPBLENDDrmi },
96999697
{ X86::VBLENDPSYrri, X86::VBLENDPSYrri, X86::VPBLENDDYrri },
97009698
{ X86::VBLENDPSYrmi, X86::VBLENDPSYrmi, X86::VPBLENDDYrmi },
97019699
{ X86::VPERMILPSYmi, X86::VPERMILPSYmi, X86::VPSHUFDYmi },
@@ -9949,6 +9947,24 @@ static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
99499947
X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
99509948
};
99519949

9950+
// NOTE: These should only be used by the custom domain methods.
9951+
static const uint16_t ReplaceableCustomInstrs[][3] = {
9952+
//PackedSingle PackedDouble PackedInt
9953+
{ X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi },
9954+
{ X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri },
9955+
{ X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDWrmi },
9956+
{ X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDWrri },
9957+
{ X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi },
9958+
{ X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri },
9959+
};
9960+
static const uint16_t ReplaceableCustomAVX2Instrs[][3] = {
9961+
//PackedSingle PackedDouble PackedInt
9962+
{ X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi },
9963+
{ X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri },
9964+
{ X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDDYrmi },
9965+
{ X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDDYrri },
9966+
};
9967+
99529968
// FIXME: Some shuffle and unpack instructions have equivalents in different
99539969
// domains, but they require a bit more work than just switching opcodes.
99549970

@@ -9969,13 +9985,177 @@ static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
99699985
return nullptr;
99709986
}
99719987

9988+
// Rescale a blend immediate from OldWidth select bits to NewWidth select bits.
//
// One of the two widths must be a multiple of the other (asserted).
//  - Narrowing (OldWidth is a multiple of NewWidth): each group of adjacent
//    old bits collapses into a single new bit. This only works when every
//    group is all-zeros or all-ones; otherwise the function returns false and
//    leaves *pNewMask untouched.
//  - Widening: each old bit is replicated into a group of new bits. This
//    always succeeds.
//
// On success the rescaled mask is stored through pNewMask when it is non-null,
// so callers may pass nullptr to merely test whether rescaling is possible.
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
                            unsigned NewWidth, unsigned *pNewMask = nullptr) {
  assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
         "Illegal blend mask scale");

  unsigned Rescaled = 0;
  const bool FewerElts = (OldWidth % NewWidth) == 0;

  if (FewerElts) {
    const unsigned Ratio = OldWidth / NewWidth;
    const unsigned GroupAllOnes = (1u << Ratio) - 1;
    for (unsigned Elt = 0; Elt < NewWidth; ++Elt) {
      const unsigned Group = (OldMask >> (Elt * Ratio)) & GroupAllOnes;
      // A partially-set group cannot be expressed at the coarser granularity.
      if (Group != 0 && Group != GroupAllOnes)
        return false;
      if (Group == GroupAllOnes)
        Rescaled |= 1u << Elt;
    }
  } else {
    const unsigned Ratio = NewWidth / OldWidth;
    const unsigned GroupAllOnes = (1u << Ratio) - 1;
    for (unsigned Elt = 0; Elt < OldWidth; ++Elt)
      if ((OldMask >> Elt) & 1u)
        Rescaled |= GroupAllOnes << (Elt * Ratio);
  }

  if (pNewMask != nullptr)
    *pNewMask = Rescaled;
  return true;
}
10019+
10020+
uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
10021+
unsigned Opcode = MI.getOpcode();
10022+
unsigned NumOperands = MI.getNumOperands();
10023+
10024+
auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
10025+
uint16_t validDomains = 0;
10026+
if (MI.getOperand(NumOperands - 1).isImm()) {
10027+
unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
10028+
if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
10029+
validDomains |= 0x2; // PackedSingle
10030+
if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
10031+
validDomains |= 0x4; // PackedDouble
10032+
if (!Is256 || Subtarget.hasAVX2())
10033+
validDomains |= 0x8; // PackedInt
10034+
}
10035+
return validDomains;
10036+
};
10037+
10038+
switch (Opcode) {
10039+
case X86::BLENDPDrmi:
10040+
case X86::BLENDPDrri:
10041+
case X86::VBLENDPDrmi:
10042+
case X86::VBLENDPDrri:
10043+
return GetBlendDomains(2, false);
10044+
case X86::VBLENDPDYrmi:
10045+
case X86::VBLENDPDYrri:
10046+
return GetBlendDomains(4, true);
10047+
case X86::BLENDPSrmi:
10048+
case X86::BLENDPSrri:
10049+
case X86::VBLENDPSrmi:
10050+
case X86::VBLENDPSrri:
10051+
case X86::VPBLENDDrmi:
10052+
case X86::VPBLENDDrri:
10053+
return GetBlendDomains(4, false);
10054+
case X86::VBLENDPSYrmi:
10055+
case X86::VBLENDPSYrri:
10056+
case X86::VPBLENDDYrmi:
10057+
case X86::VPBLENDDYrri:
10058+
return GetBlendDomains(8, true);
10059+
case X86::PBLENDWrmi:
10060+
case X86::PBLENDWrri:
10061+
case X86::VPBLENDWrmi:
10062+
case X86::VPBLENDWrri:
10063+
// Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
10064+
case X86::VPBLENDWYrmi:
10065+
case X86::VPBLENDWYrri:
10066+
return GetBlendDomains(8, false);
10067+
}
10068+
return 0;
10069+
}
10070+
10071+
/// Switch an immediate blend instruction to the requested execution domain,
/// rewriting both the opcode and the blend mask.
///
/// \param MI      Instruction to rewrite; must be an SSE-domain instruction.
/// \param Domain  Target domain: 1 = PackedSingle, 2 = PackedDouble,
///                3 = PackedInt.
/// \returns true if \p MI was a handled blend with an immediate mask and has
///          been rewritten; false if the caller should fall back to the
///          generic replacement tables.
///
/// The result of AdjustBlendMask is not checked here, so callers should only
/// request a domain that getExecutionDomainCustom reported as valid for MI.
bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
                                            unsigned Domain) const {
  assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
  uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
  assert(dom && "Not an SSE instruction");

  unsigned Opcode = MI.getOpcode();
  unsigned NumOperands = MI.getNumOperands();

  auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
    // getExecutionDomainCustom reports no custom domains when the mask is not
    // an immediate, which sends getExecutionDomain to the generic tables.
    // Report "not handled" here too so setExecutionDomain takes the matching
    // generic path instead of silently leaving MI unchanged.
    if (!MI.getOperand(NumOperands - 1).isImm())
      return false;

    unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
    // A 16-wide mask is the 8-bit immediate repeated in both halves.
    Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
    unsigned NewImm = Imm;

    const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs);
    if (!table)
      table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);

    if (Domain == 1) { // PackedSingle
      AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
    } else if (Domain == 2) { // PackedDouble
      AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
    } else if (Domain == 3) { // PackedInt
      if (Subtarget.hasAVX2()) {
        // If we are already VPBLENDW use that, else use VPBLENDD.
        if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
          table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
          AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
        }
      } else {
        assert(!Is256 && "128-bit vector expected");
        AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
      }
    }

    assert(table && table[Domain - 1] && "Unknown domain op");
    MI.setDesc(get(table[Domain - 1]));
    // Only the low 8 bits are stored back as the immediate.
    MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
    return true;
  };

  switch (Opcode) {
  case X86::BLENDPDrmi:
  case X86::BLENDPDrri:
  case X86::VBLENDPDrmi:
  case X86::VBLENDPDrri:
    return SetBlendDomain(2, false);
  case X86::VBLENDPDYrmi:
  case X86::VBLENDPDYrri:
    return SetBlendDomain(4, true);
  case X86::BLENDPSrmi:
  case X86::BLENDPSrri:
  case X86::VBLENDPSrmi:
  case X86::VBLENDPSrri:
  case X86::VPBLENDDrmi:
  case X86::VPBLENDDrri:
    return SetBlendDomain(4, false);
  case X86::VBLENDPSYrmi:
  case X86::VBLENDPSYrri:
  case X86::VPBLENDDYrmi:
  case X86::VPBLENDDYrri:
    return SetBlendDomain(8, true);
  case X86::PBLENDWrmi:
  case X86::PBLENDWrri:
  case X86::VPBLENDWrmi:
  case X86::VPBLENDWrri:
    return SetBlendDomain(8, false);
  case X86::VPBLENDWYrmi:
  case X86::VPBLENDWYrri:
    return SetBlendDomain(16, true);
  }
  return false;
}
10146+
997210147
std::pair<uint16_t, uint16_t>
997310148
X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
997410149
uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
997510150
unsigned opcode = MI.getOpcode();
997610151
uint16_t validDomains = 0;
997710152
if (domain) {
9978-
if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) {
10153+
// Attempt to match for custom instructions.
10154+
if (validDomains = getExecutionDomainCustom(MI)) {
10155+
return std::make_pair(domain, validDomains);
10156+
}
10157+
10158+
if (lookup(opcode, domain, ReplaceableInstrs)) {
997910159
validDomains = 0xe;
998010160
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
998110161
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
@@ -10007,6 +10187,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
1000710187
assert(Domain>0 && Domain<4 && "Invalid execution domain");
1000810188
uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
1000910189
assert(dom && "Not an SSE instruction");
10190+
10191+
// Attempt to match for custom instructions.
10192+
if (setExecutionDomainCustom(MI, Domain))
10193+
return;
10194+
1001010195
const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
1001110196
if (!table) { // try the other table
1001210197
assert((Subtarget.hasAVX2() || Domain < 3) &&

Diff for: ‎llvm/lib/Target/X86/X86InstrInfo.h

+4
Original file line number | Diff line number | Diff line change
@@ -490,8 +490,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
490490
std::pair<uint16_t, uint16_t>
491491
getExecutionDomain(const MachineInstr &MI) const override;
492492

493+
uint16_t getExecutionDomainCustom(const MachineInstr &MI) const;
494+
493495
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
494496

497+
bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const;
498+
495499
unsigned
496500
getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
497501
const TargetRegisterInfo *TRI) const override;

0 commit comments

Comments
 (0)
Please sign in to comment.