diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -262,6 +262,7 @@
   void SelectFMUL_W_CHAIN(SDNode *N);
   SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,
                    uint32_t Width);
+  bool SelectV_BFI(SDNode *N);
   void SelectS_BFEFromShifts(SDNode *N);
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -674,6 +674,11 @@
     SelectS_BFE(N);
     return;
+  case ISD::OR:
+    if (SelectV_BFI(N))
+      return;
+
+    break;
   case ISD::BRCOND:
     SelectBRCOND(N);
     return;
@@ -2146,6 +2151,136 @@
   SelectCode(N);
 }
 
+// Try matching a 32-bit expression
+//   (X_1 & C_1) | (X_2 & C_2) | ... | (X_n & C_n)
+// with constants C_i. If matching succeeds, convert it to a sequence of
+// nested V_BFI instructions, if possible.
+// Creating the canonical bitfieldInsert form in the frontend, where one
+// bitfieldInsert is the base for another bitfieldInsert, lets
+// SimplifyDemandedBits simplify the bitmasks involved.
+// Originally, the intended expression looks like:
+//   (X1 & C1) | (~C1 & ((X2 & C2) | (X3 & ~C3)))
+// However, after SimplifyDemandedBits, the expression reaching ISel looks
+// like:
+//   (X1 & C1) | ((X2 & C2) | (X3 & (~C1 | ~C3)))
+bool AMDGPUDAGToDAGISel::SelectV_BFI(SDNode *N) {
+  if (!N->isDivergent())
+    return false;
+
+  struct MaskData {
+    SDNode *BaseOperand;
+    SDNode *MaskOperand;
+    llvm::Optional<uint64_t> Mask;
+  };
+
+  struct BFIOperandData {
+    SDNode *Or;
+    MaskData Mask;
+  };
+
+  SmallVector<BFIOperandData> BFIOps;
+  uint64_t Disjunction = 0;
+
+  auto ExtractMask = [](SDNode *And) -> MaskData {
+    ConstantSDNode *MaskOperand = dyn_cast<ConstantSDNode>(And->getOperand(0));
+    SDNode *BaseOperand = And->getOperand(1).getNode();
+    if (!MaskOperand) {
+      BaseOperand = And->getOperand(0).getNode();
+      MaskOperand = dyn_cast<ConstantSDNode>(And->getOperand(1));
+    }
+    if (!MaskOperand)
+      return {};
+
+    return {BaseOperand, MaskOperand, MaskOperand->getZExtValue()};
+  };
+
+  // Check if this is an AND with a constant operand. Check that the mask from
+  // the constant operand fulfills several conditions, update the total mask,
+  // and save the operand nodes.
+  enum class BFICheckStatus : uint8_t { Proceed, Abort };
+  auto CreateBFIOpFromAnd = [&](SDNode *Or, SDNode *And) -> BFICheckStatus {
+    if (And->getOpcode() != ISD::AND)
+      return BFICheckStatus::Proceed;
+    const auto MaskData = ExtractMask(And);
+    if (!MaskData.Mask.has_value())
+      return BFICheckStatus::Proceed;
+
+    // The masks need to be disjoint.
+    if ((Disjunction & MaskData.Mask.value()) != 0)
+      return BFICheckStatus::Abort;
+
+    Disjunction |= MaskData.Mask.value();
+    BFIOps.push_back({Or, MaskData});
+    return BFICheckStatus::Proceed;
+  };
+
+  // Collect all OR instructions, so the constant masks from the AND
+  // instructions can be fetched later.
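+  // The walk below uses an explicit worklist instead of recursion. Only
+  // divergent, single-use ORs are treated as candidates, so every
+  // intermediate OR can be absorbed into the resulting V_BFI chain without
+  // leaving other users behind.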
+  SmallVector<SDNode *> OrStack;
+  SmallVector<SDNode *> OrCandidates;
+  OrStack.push_back(N);
+  OrCandidates.push_back(N);
+  auto IsCandidate = [](SDValue Or) -> bool {
+    return Or->isDivergent() && Or->getOpcode() == ISD::OR && Or->hasOneUse();
+  };
+
+  // Push all possible OR candidates, including the function argument, onto a
+  // stack.
+  while (!OrStack.empty()) {
+    SDNode *Current = OrStack.back();
+    OrStack.pop_back();
+
+    SDValue O0 = Current->getOperand(0);
+    SDValue O1 = Current->getOperand(1);
+
+    if (IsCandidate(O0)) {
+      OrStack.push_back(O0.getNode());
+      OrCandidates.push_back(O0.getNode());
+    }
+
+    if (IsCandidate(O1)) {
+      OrStack.push_back(O1.getNode());
+      OrCandidates.push_back(O1.getNode());
+    }
+
+    if (O0.getOpcode() != ISD::OR && O1.getOpcode() != ISD::OR)
+      break;
+  }
+
+  // Record the AND operands and bitmasks, and return false if there was a
+  // failure (an unsuitable bitmask). This happens as a separate step so the
+  // top-level node does not need to be treated separately.
+  while (!OrCandidates.empty()) {
+    SDNode *Current = OrCandidates.back();
+    OrCandidates.pop_back();
+    SDValue LHS = Current->getOperand(0);
+    SDValue RHS = Current->getOperand(1);
+
+    if (CreateBFIOpFromAnd(Current, RHS.getNode()) == BFICheckStatus::Abort ||
+        CreateBFIOpFromAnd(Current, LHS.getNode()) == BFICheckStatus::Abort)
+      return false;
+  }
+
+  // The constants are required to be a partition of -1.
+  if (Disjunction != 0xFFFFFFFF)
+    return false;
+
+  // Transform the operands node by node and replace the original node.
+  // Assume the last node is the base for the BFI insertion. V_BFI_B32
+  // computes (S0 & S1) | (~S0 & S2), i.e. it inserts S1 under the mask S0
+  // into the base S2.
+  SDNode *BaseNode = BFIOps.front().Mask.BaseOperand;
+  assert(BaseNode && "Cannot use non-existing node as base!");
+  for (auto &BFITuple : makeArrayRef(BFIOps).drop_front()) {
+    BaseNode = CurDAG->getMachineNode(
+        AMDGPU::V_BFI_B32_e64, SDLoc(BFITuple.Or), BFITuple.Or->getValueType(0),
+        {SDValue(BFITuple.Mask.MaskOperand, 0),
+         SDValue(BFITuple.Mask.BaseOperand, 0), SDValue(BaseNode, 0)});
+  }
+
+  ReplaceNode(N, BaseNode);
+
+  return true;
+}
+
 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
   switch (N->getOpcode()) {
   case ISD::AND:
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -7,16 +7,15 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    s_mov_b32 s4, 0xffc00
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
+; GCN-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x3ff00000
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -41,16 +40,15 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    s_mov_b32 s4, 0xffc00
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
+; GCN-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_mov_b32 s4, 0x3ff00000
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -75,18 +73,17 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    s_mov_b32 s4, 0xffc00
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
+; GCN-NEXT:    s_mov_b32 s5, 0xc000001f
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v2
+; GCN-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_bfi_b32 v1, s5, v2, v1
+; GCN-NEXT:    s_mov_b32 s4, 0x3ff00000
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -178,18 +175,17 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    s_movk_i32 s4, 0x3e0
+; GCN-NEXT:    s_mov_b32 s5, 0xffc00
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0x3e0, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
+; GCN-NEXT:    v_bfi_b32 v2, s4, v3, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_bfi_b32 v1, s5, v1, v2
+; GCN-NEXT:    s_mov_b32 s4, 0x3ff00000
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %y.i32 = fptoui float %y to i32