@@ -9694,8 +9694,6 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
9694
9694
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
9695
9695
{ X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
9696
9696
{ X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
9697
- { X86::VBLENDPSrri, X86::VBLENDPSrri, X86::VPBLENDDrri },
9698
- { X86::VBLENDPSrmi, X86::VBLENDPSrmi, X86::VPBLENDDrmi },
9699
9697
{ X86::VBLENDPSYrri, X86::VBLENDPSYrri, X86::VPBLENDDYrri },
9700
9698
{ X86::VBLENDPSYrmi, X86::VBLENDPSYrmi, X86::VPBLENDDYrmi },
9701
9699
{ X86::VPERMILPSYmi, X86::VPERMILPSYmi, X86::VPSHUFDYmi },
@@ -9949,6 +9947,24 @@ static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
9949
9947
X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
9950
9948
};
9951
9949
9950
// NOTE: These should only be used by the custom domain methods.
// Maps each blend opcode across the three SSE execution domains
// (PackedSingle / PackedDouble / PackedInt). Unlike the generic
// Replaceable* tables, switching between these opcodes also requires the
// 8-bit blend immediate to be rescaled (see AdjustBlendMask), which is why
// the generic lookup() paths in get/setExecutionDomain must not use them.
// The PackedInt column uses PBLENDW (word-granular), available from SSE4.1.
static const uint16_t ReplaceableCustomInstrs[][3] = {
  //PackedSingle             PackedDouble             PackedInt
  { X86::BLENDPSrmi,         X86::BLENDPDrmi,         X86::PBLENDWrmi   },
  { X86::BLENDPSrri,         X86::BLENDPDrri,         X86::PBLENDWrri   },
  { X86::VBLENDPSrmi,        X86::VBLENDPDrmi,        X86::VPBLENDWrmi  },
  { X86::VBLENDPSrri,        X86::VBLENDPDrri,        X86::VPBLENDWrri  },
  { X86::VBLENDPSYrmi,       X86::VBLENDPDYrmi,       X86::VPBLENDWYrmi },
  { X86::VBLENDPSYrri,       X86::VBLENDPDYrri,       X86::VPBLENDWYrri },
};
9960
// AVX2-only variants for the custom domain methods: the PackedInt column
// uses VPBLENDD (dword-granular blends, AVX2) instead of VPBLENDW, so the
// integer-domain form can express any dword-granular mask, including the
// 256-bit YMM versions.
static const uint16_t ReplaceableCustomAVX2Instrs[][3] = {
  //PackedSingle             PackedDouble             PackedInt
  { X86::VBLENDPSrmi,        X86::VBLENDPDrmi,        X86::VPBLENDDrmi  },
  { X86::VBLENDPSrri,        X86::VBLENDPDrri,        X86::VPBLENDDrri  },
  { X86::VBLENDPSYrmi,       X86::VBLENDPDYrmi,       X86::VPBLENDDYrmi },
  { X86::VBLENDPSYrri,       X86::VBLENDPDYrri,       X86::VPBLENDDYrri },
};
9967
+
9952
9968
// FIXME: Some shuffle and unpack instructions have equivalents in different
9953
9969
// domains, but they require a bit more work than just switching opcodes.
9954
9970
@@ -9969,13 +9985,177 @@ static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
9969
9985
return nullptr;
9970
9986
}
9971
9987
9988
+ // Helper to attempt to widen/narrow blend masks.
9989
+ static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
9990
+ unsigned NewWidth, unsigned *pNewMask = nullptr) {
9991
+ assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
9992
+ "Illegal blend mask scale");
9993
+ unsigned NewMask = 0;
9994
+
9995
+ if ((OldWidth % NewWidth) == 0) {
9996
+ unsigned Scale = OldWidth / NewWidth;
9997
+ unsigned SubMask = (1u << Scale) - 1;
9998
+ for (unsigned i = 0; i != NewWidth; ++i) {
9999
+ unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
10000
+ if (Sub == SubMask)
10001
+ NewMask |= (1u << i);
10002
+ else if (Sub != 0x0)
10003
+ return false;
10004
+ }
10005
+ } else {
10006
+ unsigned Scale = NewWidth / OldWidth;
10007
+ unsigned SubMask = (1u << Scale) - 1;
10008
+ for (unsigned i = 0; i != OldWidth; ++i) {
10009
+ if (OldMask & (1 << i)) {
10010
+ NewMask |= (SubMask << (i * Scale));
10011
+ }
10012
+ }
10013
+ }
10014
+
10015
+ if (pNewMask)
10016
+ *pNewMask = NewMask;
10017
+ return true;
10018
+ }
10019
+
10020
+ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
10021
+ unsigned Opcode = MI.getOpcode();
10022
+ unsigned NumOperands = MI.getNumOperands();
10023
+
10024
+ auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
10025
+ uint16_t validDomains = 0;
10026
+ if (MI.getOperand(NumOperands - 1).isImm()) {
10027
+ unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
10028
+ if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
10029
+ validDomains |= 0x2; // PackedSingle
10030
+ if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
10031
+ validDomains |= 0x4; // PackedDouble
10032
+ if (!Is256 || Subtarget.hasAVX2())
10033
+ validDomains |= 0x8; // PackedInt
10034
+ }
10035
+ return validDomains;
10036
+ };
10037
+
10038
+ switch (Opcode) {
10039
+ case X86::BLENDPDrmi:
10040
+ case X86::BLENDPDrri:
10041
+ case X86::VBLENDPDrmi:
10042
+ case X86::VBLENDPDrri:
10043
+ return GetBlendDomains(2, false);
10044
+ case X86::VBLENDPDYrmi:
10045
+ case X86::VBLENDPDYrri:
10046
+ return GetBlendDomains(4, true);
10047
+ case X86::BLENDPSrmi:
10048
+ case X86::BLENDPSrri:
10049
+ case X86::VBLENDPSrmi:
10050
+ case X86::VBLENDPSrri:
10051
+ case X86::VPBLENDDrmi:
10052
+ case X86::VPBLENDDrri:
10053
+ return GetBlendDomains(4, false);
10054
+ case X86::VBLENDPSYrmi:
10055
+ case X86::VBLENDPSYrri:
10056
+ case X86::VPBLENDDYrmi:
10057
+ case X86::VPBLENDDYrri:
10058
+ return GetBlendDomains(8, true);
10059
+ case X86::PBLENDWrmi:
10060
+ case X86::PBLENDWrri:
10061
+ case X86::VPBLENDWrmi:
10062
+ case X86::VPBLENDWrri:
10063
+ // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
10064
+ case X86::VPBLENDWYrmi:
10065
+ case X86::VPBLENDWYrri:
10066
+ return GetBlendDomains(8, false);
10067
+ }
10068
+ return 0;
10069
+ }
10070
+
10071
// Attempts to switch MI to the requested execution domain (1 = PackedSingle,
// 2 = PackedDouble, 3 = PackedInt) for the custom-handled blend opcodes,
// rewriting both the opcode (via the ReplaceableCustom* tables) and the
// rescaled blend immediate. Returns true if MI was one of the custom
// opcodes (i.e. the generic tables should not be consulted), false to let
// the caller fall back to the generic domain-fixing tables.
bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
                                            unsigned Domain) const {
  assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
  // Current domain of MI, encoded in the instruction's TSFlags.
  uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
  assert(dom && "Not an SSE instruction");

  unsigned Opcode = MI.getOpcode();
  unsigned NumOperands = MI.getNumOperands();

  // ImmWidth is the lane count of MI's current blend mask; Is256 selects
  // the YMM lane counts when rescaling toward the FP domains.
  auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
    if (MI.getOperand(NumOperands - 1).isImm()) {
      unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
      // VPBLENDWY repeats the same 8-bit mask in the lo/hi 128-bit lanes,
      // so widen it to a 16-lane mask by duplication before rescaling.
      Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
      unsigned NewImm = Imm;

      // Prefer the base table (PBLENDW integer column); fall back to the
      // AVX2 table (VPBLENDD column) for opcodes only present there.
      const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs);
      if (!table)
        table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);

      if (Domain == 1) { // PackedSingle
        AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
      } else if (Domain == 2) { // PackedDouble
        AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
      } else if (Domain == 3) { // PackedInt
        if (Subtarget.hasAVX2()) {
          // If we are already VPBLENDW use that, else use VPBLENDD.
          // (ImmWidth/(Is256 ? 2 : 1)) == 8 identifies a word-granular
          // mask, which VPBLENDW already expresses; otherwise re-lookup in
          // the AVX2 table and rescale to VPBLENDD's dword lanes.
          if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
            table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
            AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
          }
        } else {
          // Without AVX2 only 128-bit PBLENDW (8 word lanes) is available.
          assert(!Is256 && "128-bit vector expected");
          AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
        }
      }

      assert(table && table[Domain - 1] && "Unknown domain op");
      MI.setDesc(get(table[Domain - 1]));
      // Blend immediates are 8 bits; the duplicated VPBLENDWY mask folds
      // back down via the low byte.
      MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
    }
    // Return true even when the operand wasn't an immediate: the opcode is
    // ours, so the caller must not retry with the generic tables.
    return true;
  };

  switch (Opcode) {
  case X86::BLENDPDrmi:
  case X86::BLENDPDrri:
  case X86::VBLENDPDrmi:
  case X86::VBLENDPDrri:
    return SetBlendDomain(2, false);
  case X86::VBLENDPDYrmi:
  case X86::VBLENDPDYrri:
    return SetBlendDomain(4, true);
  case X86::BLENDPSrmi:
  case X86::BLENDPSrri:
  case X86::VBLENDPSrmi:
  case X86::VBLENDPSrri:
  case X86::VPBLENDDrmi:
  case X86::VPBLENDDrri:
    return SetBlendDomain(4, false);
  case X86::VBLENDPSYrmi:
  case X86::VBLENDPSYrri:
  case X86::VPBLENDDYrmi:
  case X86::VPBLENDDYrri:
    return SetBlendDomain(8, true);
  case X86::PBLENDWrmi:
  case X86::PBLENDWrri:
  case X86::VPBLENDWrmi:
  case X86::VPBLENDWrri:
    return SetBlendDomain(8, false);
  case X86::VPBLENDWYrmi:
  case X86::VPBLENDWYrri:
    return SetBlendDomain(16, true);
  }
  return false;
}
10146
+
9972
10147
std::pair<uint16_t, uint16_t>
9973
10148
X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
9974
10149
uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9975
10150
unsigned opcode = MI.getOpcode();
9976
10151
uint16_t validDomains = 0;
9977
10152
if (domain) {
9978
- if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) {
10153
+ // Attempt to match for custom instructions.
10154
+ if (validDomains = getExecutionDomainCustom(MI)) {
10155
+ return std::make_pair(domain, validDomains);
10156
+ }
10157
+
10158
+ if (lookup(opcode, domain, ReplaceableInstrs)) {
9979
10159
validDomains = 0xe;
9980
10160
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
9981
10161
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
@@ -10007,6 +10187,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
10007
10187
assert(Domain>0 && Domain<4 && "Invalid execution domain");
10008
10188
uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
10009
10189
assert(dom && "Not an SSE instruction");
10190
+
10191
+ // Attempt to match for custom instructions.
10192
+ if (setExecutionDomainCustom(MI, Domain))
10193
+ return;
10194
+
10010
10195
const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
10011
10196
if (!table) { // try the other table
10012
10197
assert((Subtarget.hasAVX2() || Domain < 3) &&
0 commit comments