diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -262,6 +262,7 @@
   void SelectFMUL_W_CHAIN(SDNode *N);
   SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,
                    uint32_t Width);
+  bool SelectV_BFI(SDNode *N);
   void SelectS_BFEFromShifts(SDNode *N);
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -674,6 +674,11 @@
     SelectS_BFE(N);
     return;
+  case ISD::OR:
+    if (SelectV_BFI(N))
+      return;
+
+    break;
   case ISD::BRCOND:
     SelectBRCOND(N);
     return;
@@ -2146,6 +2151,116 @@
   SelectCode(N);
 }
 
+// Try matching a 32-bit expression
+// (X_1 & C_1) | (X_2 & C_2) | ... | (X_n & C_n)
+// with constants C_i. If the match succeeds, convert the expression into
+// a sequence of nested V_BFI instructions.
+// Creating the canonical bitfieldInsert form in the frontend leads to
+// simplification of the involved bitmasks through SimplifyDemandedBits,
+// where one bitfieldInsert is the base for another bitfieldInsert.
+// Originally, the intended expression looks like:
+// (X1 & C1) | (~C1 & ((X2 & C2) | (X3 & ~C3)))
+// However, after SimplifyDemandedBits has run during ISel, the expression
+// will look like:
+// (X1 & C1) | ((X2 & C2) | (X3 & (~C1 | ~C3)))
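+//
+// For illustration, using the mask values from the accompanying tests in
+// bfi_nested.ll: with C1 = 0x3ff00000, C2 = 0x000ffc00 and
+// ~C1 & ~C2 = 0xc00003ff left for X3, the masks are pairwise disjoint
+// and together cover all 32 bits, so the expression selects to
+//   v_bfi_b32 v1, 0x000ffc00, X2, X3 ; (X2 & C2) | (X3 & ~C2)
+//   v_bfi_b32 v0, 0x3ff00000, X1, v1 ; (X1 & C1) | (v1 & ~C1)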
+bool AMDGPUDAGToDAGISel::SelectV_BFI(SDNode *N) {
+  struct BFIOperandData {
+    SDNode *X;
+    uint32_t CVal;
+  };
+
+  SmallVector<BFIOperandData> BFIOps;
+  uint32_t Disjunction = 0;
+
+  // Check if this is an AND with a constant operand. If the mask from
+  // the constant operand fulfills the required conditions, update the
+  // total mask and save the AND node as a candidate.
+  auto CreateBFIOpFromAnd = [&](SDNode *And) -> bool {
+    if (And->getOpcode() != ISD::AND)
+      return false;
+
+    ConstantSDNode *Base = dyn_cast<ConstantSDNode>(And->getOperand(0));
+    if (!Base)
+      Base = dyn_cast<ConstantSDNode>(And->getOperand(1));
+    if (!Base)
+      return false;
+
+    const uint32_t Mask = Base->getZExtValue();
+
+    // The masks need to be disjoint.
+    if ((Disjunction & Mask) != 0)
+      return false;
+
+    Disjunction |= Mask;
+    BFIOps.push_back({And, Mask});
+    return true;
+  };
+
+  // We traverse the OR operands of OR (OR, AND) instructions here until
+  // an OR (AND, AND) instruction is reached. During that process, the
+  // constant masks from the AND instructions are gathered.
+  SDNode *CurrentOr = N;
+  while (true) {
+    SDValue O0 = CurrentOr->getOperand(0);
+    SDValue O1 = CurrentOr->getOperand(1);
+    // All investigated OR and AND instructions except the outer one will
+    // be optimized away on success, so they must not have more than one
+    // use (which holds in the BFI pattern).
+    if (!O0.hasOneUse() || !O1.hasOneUse())
+      return false;
+
+    // One of the operands is required to be an OR instruction.
+    SDNode *NewOr = nullptr;
+    SDNode *Other = nullptr;
+    if (O0.getOpcode() == ISD::OR) {
+      NewOr = O0.getNode();
+      Other = O1.getNode();
+    } else if (O1.getOpcode() == ISD::OR) {
+      NewOr = O1.getNode();
+      Other = O0.getNode();
+    }
+
+    // Try inserting the AND operand as a candidate, then traverse down
+    // further.
+    if (NewOr && Other && CreateBFIOpFromAnd(Other)) {
+      CurrentOr = NewOr;
+      continue;
+    }
+
+    // If we visited at least one level of instructions, check if we
+    // reached an OR with two AND operands. Try inserting both operands;
+    // if that succeeds, we reached the (X_i & C_i) | (X_j & C_j) part
+    // and can abort the loop.
+    if (CurrentOr != N && CreateBFIOpFromAnd(O0.getNode()) &&
+        CreateBFIOpFromAnd(O1.getNode())) {
+      break;
+    }
+
+    return false;
+  }
+
+  // Require that at least two nesting levels have been visited and that
+  // the gathered constants form a partition of -1, i.e. the masks are
+  // pairwise disjoint and together cover all 32 bits.
+  if (CurrentOr == N || Disjunction != 0xFFFFFFFF)
+    return false;
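+
+  // V_BFI_B32 computes (S0 & S1) | (~S0 & S2), i.e. it inserts the bits
+  // of S1 selected by the mask S0 into the base value S2.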
+  // Transform the operands node by node and replace the original node.
+  // The value operand of the last AND gathered serves as the base of the
+  // innermost BFI.
+  SDNode *NewNode = BFIOps.back().X->getOperand(0).getNode();
+
+  assert(NewNode && "Cannot use non-existing node as base!");
+  for (auto &BFITuple : reverse(makeArrayRef(BFIOps).drop_back())) {
+    NewNode = CurDAG->getMachineNode(
+        AMDGPU::V_BFI_B32_e64, SDLoc(N), N->getValueType(0),
+        {BFITuple.X->getOperand(1), BFITuple.X->getOperand(0),
+         SDValue(NewNode, 0)});
+  }
+
+  ReplaceNode(N, NewNode);
+
+  return true;
+}
+
 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
   switch (N->getOpcode()) {
   case ISD::AND:
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -15,17 +15,15 @@
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -44,6 +42,38 @@
   ret float %result
 }
 
+define float @v_bfi_single_nesting_level_swapped_operands(float %x, float %y, float %z) {
+; GFX10-LABEL: v_bfi_single_nesting_level_swapped_operands:
+; GFX10:       ; %bb.0: ; %.entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+.entry:
+  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
+  %mul.base.i32 = fptoui float %mul.base to i32
+  %y.i32 = fptoui float %y to i32
+  %shl.inner.insert = shl i32 %y.i32, 10
+  %bfi1.and = and i32 1047552, %shl.inner.insert
+  %bfi1.andnot = and i32 -1073740801, %mul.base.i32
+  %bfi1.or = or i32 %bfi1.and, %bfi1.andnot
+  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03
+  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
+  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20
+  %and.outer = and i32 %shl.outer.insert, 1072693248
+  %or.outer = or i32 %and.outer, %bfi1.or
+  %result = bitcast i32 %or.outer to float
+  ret float %result
+}
+
 define float @v_bfi_single_nesting_level_inner_use(float %x, float %y, float %z) {
 ; GFX10-LABEL: v_bfi_single_nesting_level_inner_use:
 ; GFX10:       ; %bb.0: ; %.entry
@@ -114,18 +144,16 @@
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_bfi_b32 v2, 0x3e0, v3, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT:    v_and_or_b32 v2, 0x3e0, v3, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-NEXT:    v_or3_b32 v0, v2, v1, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x3ff00000, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %y.i32 = fptoui float %y to i32