# Please use GitHub pull requests for new patches. Avoid migrating existing patches. Phabricator shutdown timeline

# Changeset View

# Standalone View

# llvm/lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 2,041 Lines • ▼ Show 20 Lines | return X && NotX && | ||||

~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue(); | ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue(); | ||||

}] | }] | ||||

>; | >; | ||||

// Definition from ISA doc: | // Definition from ISA doc: | ||||

// (y & x) | (z & ~x) | // (y & x) | (z & ~x) | ||||

def : AMDGPUPat < | def : AMDGPUPat < | ||||

(DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), | (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), | ||||

(V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) | (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), | ||||

(COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), | |||||

(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) | |||||

>; | >; | ||||

arsenm: I think this needs to go off of a predicate. If we have to generate so many copies it's… | |||||

Do you mean they should be on the matching part (`DivergentBinFrag<...`) and not on the output pattern? I don't remember how we want to handle this in GISel. Do we want to be aware of constant-bus limitations at matching time, or ignore it/always respect it by greedily inserting copies and cleaning it up later (with SIFoldOperands and an eventual improved version)? FWIW, I was thinking about adding a "Finalizer" method to GISel Pattern Matching to allow targets to call some C++ code before the pattern is applied/instructions are built, so it can "veto" the pattern if needed and fail if it's non-profitable, for instance. I didn't make an RFC yet, but if it sounds like something that would be useful I can make one in the coming days/week. Pierre-vh: Do you mean they should be on the matching part (`DivergentBinFrag<...`) and not on the output… | |||||

Not Done · Reply · Inline Actions — It's partially an open question how to handle the constant bus problem. The current strategy is supposed to be to let regbankselect aggressively emit copies to VGPR up front so it's impossible to violate, which SIFoldOperands can clean up. In the case of patterns, I think it would be worse if we had to manually write all of them out in C++ to handle them in SIFoldOperands. Selection patterns should be applying logic to avoid violating it. The finalizer sounds the same as the current arbitrary code predicates? arsenm: It's partially an open question for how to handle the constant bus problem. The current… | |||||

So what I did here follows the current strategy, right? It aggressively copies to VGPRs and lets SIFoldOperands clean up. I suppose an alternative could be to add some PatFrag(s) with GISel code with a heuristic to prevent matching if more than X copies cannot be folded but it feels fragile (because we'd need an arbitrary limit). What do you think? I also don't think there's a test case here where matching BFI leads to worse code due to too many copies. I think the odds of most of the copies being folded-out are pretty good. There's sometimes one left but it's better than having 3 or 4 instructions to do what BFI can do in one. Pierre-vh: So what I did here follows the current strategy, right? It aggressively copies to VGPRs and… | |||||

// (y & C) | (z & ~C) | // (y & C) | (z & ~C) | ||||

def : AMDGPUPat < | def : AMDGPUPat < | ||||

(BFIImm32 i32:$x, i32:$y, i32:$z), | (BFIImm32 i32:$x, i32:$y, i32:$z), | ||||

(V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) | (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) | ||||

>; | >; | ||||

// 64-bit version | // 64-bit version | ||||

def : AMDGPUPat < | def : AMDGPUPat < | ||||

(DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), | (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), | ||||

(REG_SEQUENCE VReg_64, | (REG_SEQUENCE VReg_64, | ||||

(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), | (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), | ||||

(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), | (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), | ||||

(i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, | (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, | ||||

(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), | (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), | ||||

(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), | (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), | ||||

(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) | (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) | ||||

>; | >; | ||||

// SHA-256 Ch function | // SHA-256 Ch function | ||||

// z ^ (x & (y ^ z)) | // z ^ (x & (y ^ z)) | ||||

def : AMDGPUPat < | def : AMDGPUPat < | ||||

(DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), | (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), | ||||

(V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) | (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), | ||||

(COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), | |||||

(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) | |||||

>; | >; | ||||

// 64-bit version | // 64-bit version | ||||

def : AMDGPUPat < | def : AMDGPUPat < | ||||

(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), | (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), | ||||

(REG_SEQUENCE VReg_64, | (REG_SEQUENCE VReg_64, | ||||

(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), | (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), | ||||

(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), | (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), | ||||

▲ Show 20 Lines • Show All 1,099 Lines • ▼ Show 20 Lines | |||||

>; | >; | ||||

// SHA-256 Ma patterns | // SHA-256 Ma patterns | ||||

// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y | // ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y | ||||

def : AMDGPUPat < | def : AMDGPUPat < | ||||

(DivergentBinFrag<or> (and i32:$x, i32:$z), | (DivergentBinFrag<or> (and i32:$x, i32:$z), | ||||

(and i32:$y, (or i32:$x, i32:$z))), | (and i32:$y, (or i32:$x, i32:$z))), | ||||

(V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y) | (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), | ||||

(COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)), | |||||

(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32), | |||||

(COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)) | |||||

>; | >; | ||||

def : AMDGPUPat < | def : AMDGPUPat < | ||||

(DivergentBinFrag<or> (and i64:$x, i64:$z), | (DivergentBinFrag<or> (and i64:$x, i64:$z), | ||||

(and i64:$y, (or i64:$x, i64:$z))), | (and i64:$y, (or i64:$x, i64:$z))), | ||||

(REG_SEQUENCE VReg_64, | (REG_SEQUENCE VReg_64, | ||||

(V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), | (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), | ||||

(i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), | (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), | ||||

▲ Show 20 Lines • Show All 434 Lines • Show Last 20 Lines |

I think this needs to go off of a predicate. If we have to generate so many copies, matching the pattern is potentially worse than not matching it at all.