diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -262,6 +262,7 @@
   void SelectFMUL_W_CHAIN(SDNode *N);
   SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,
                    uint32_t Width);
+  bool SelectV_BFI(SDNode *N);
   void SelectS_BFEFromShifts(SDNode *N);
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -674,6 +674,11 @@
     SelectS_BFE(N);
     return;
+  case ISD::OR:
+    if (SelectV_BFI(N))
+      return;
+
+    break;
   case ISD::BRCOND:
     SelectBRCOND(N);
     return;
@@ -2146,6 +2151,89 @@
   SelectCode(N);
 }
 
+// Try matching a 32-bit expression
+//   (X_1 & C_1) | (X_2 & C_2) | ... | (X_n & C_n)
+// with the constants C_i. If matching succeeds, convert it into a sequence
+// of nested V_BFI instructions.
+// Creating the canonical bitfieldInsert form in the frontend, where one
+// bitfieldInsert is the base of another, lets SimplifyDemandedBits simplify
+// the bitmasks involved.
+// Originally, the intended expression looks like:
+//   (X1 & C1) | (~C1 & ((X2 & C2) | (X3 & ~C3)))
+// However, after SimplifyDemandedBits has run during ISel, the expression
+// will look like:
+//   (X1 & C1) | ((X2 & C2) | (X3 & (~C1 & ~C3)))
+bool AMDGPUDAGToDAGISel::SelectV_BFI(SDNode *N) {
+  struct BFIOperandData {
+    SDNode *X;
+    uint32_t CVal;
+  };
+
+  SmallVector<BFIOperandData> BFIOps;
+  uint32_t Disjunction = 0;
+
+  auto InsertBFIOp = [&](SDNode *And) -> bool {
+    if (!And->hasOneUse())
+      return false;
+
+    ConstantSDNode *Base = dyn_cast<ConstantSDNode>(And->getOperand(1));
+    if (!Base)
+      return false;
+
+    const uint32_t Mask = Base->getZExtValue();
+    // The masks need to be disjoint.
+    if ((Disjunction & Mask) != 0)
+      return false;
+
+    Disjunction |= Mask;
+    BFIOps.push_back({And, Mask});
+    return true;
+  };
+
+  // Fetch all AND nodes as well as the values of their constant operands.
+  SDNode *CurrentOr = N;
+  while (CurrentOr && (CurrentOr == N || CurrentOr->hasOneUse())) {
+    SDValue O0 = CurrentOr->getOperand(0);
+    SDValue O1 = CurrentOr->getOperand(1);
+    if (O1.getOpcode() != ISD::AND)
+      return false;
+
+    if (O0.getOpcode() == ISD::OR) {
+      if (!InsertBFIOp(O1.getNode()))
+        return false;
+
+      CurrentOr = O0.getNode();
+    } else if (O0.getOpcode() == ISD::AND) {
+      if (!InsertBFIOp(O0.getNode()) || !InsertBFIOp(O1.getNode()))
+        return false;
+
+      break;
+    } else {
+      return false;
+    }
+  }
+
+  // Require that at least two nesting levels have been visited.
+  // The constants are required to form a partition of -1.
+  if (CurrentOr == N || Disjunction != 0xFFFFFFFF)
+    return false;
+
+  // Transform the operands node by node and replace the original node.
+  // Start with the bottom-most node.
+  SDNode *NewNode = BFIOps.back().X->getOperand(0).getNode();
+
+  assert(NewNode && "Cannot use non-existing node as base!");
+  for (auto &BFITuple : reverse(makeArrayRef(BFIOps).drop_back())) {
+    NewNode = CurDAG->getMachineNode(
+        AMDGPU::V_BFI_B32_e64, SDLoc(N), N->getValueType(0),
+        {BFITuple.X->getOperand(1), BFITuple.X->getOperand(0),
+         SDValue(NewNode, 0)});
+  }
+
+  ReplaceNode(N, NewNode);
+
+  return true;
+}
+
 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
   switch (N->getOpcode()) {
   case ISD::AND:
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -15,17 +15,15 @@
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -114,18 +112,16 @@
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_bfi_b32 v2, 0x3e0, v3, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT:    v_and_or_b32 v2, 0x3e0, v3, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-NEXT:    v_or3_b32 v0, v2, v1, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %y.i32 = fptoui float %y to i32
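
Not part of the patch itself: a minimal host-side sketch of the equivalence the selection relies on, assuming the documented V_BFI_B32 semantics D = (S0 & S1) | (~S0 & S2). The masks reuse the constants from the first test case; the helper name and the sample input values are illustrative only.

// Standalone sanity check: models V_BFI_B32 and verifies that the nested BFI
// sequence reproduces the masked-OR expression when the masks partition
// 0xFFFFFFFF (the Disjunction == 0xFFFFFFFF requirement in SelectV_BFI).
#include <cassert>
#include <cstdint>

static uint32_t bfi(uint32_t Mask, uint32_t Ins, uint32_t Base) {
  return (Mask & Ins) | (~Mask & Base); // V_BFI_B32: D = (S0 & S1) | (~S0 & S2)
}

int main() {
  // Disjoint masks covering all 32 bits, taken from the first test.
  const uint32_t C1 = 0x3ff00000u, C2 = 0x000ffc00u, C3 = 0xc00003ffu;
  assert((C1 & C2) == 0 && (C1 & C3) == 0 && (C2 & C3) == 0);
  assert((C1 | C2 | C3) == 0xffffffffu);

  // Arbitrary sample inputs.
  const uint32_t X1 = 0xdeadbeefu, X2 = 0x12345678u, X3 = 0xcafef00du;

  // The matched expression: (X1 & C1) | (X2 & C2) | (X3 & C3).
  const uint32_t MaskedOr = (X1 & C1) | (X2 & C2) | (X3 & C3);

  // The emitted sequence: the value operand of the last AND found (X3 here)
  // becomes the base, and every remaining operand adds one nested V_BFI.
  const uint32_t Nested = bfi(C1, X1, bfi(C2, X2, X3));

  assert(Nested == MaskedOr);
  return 0;
}

The equality holds for arbitrary inputs because the base operand's own mask is exactly the complement of the outer masks, which is why the matcher can drop the innermost AND and use its value operand directly.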