diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -35,6 +35,7 @@
   SRA_W,
   SRL_W,
 
+  BSTRINS,
   BSTRPICK,
 
 };
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
 
 using namespace llvm;
 
@@ -92,6 +93,7 @@
   setMinFunctionAlignment(FunctionAlignment);
 
   setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::SRL);
 }
 
@@ -406,6 +408,239 @@
   return SDValue();
 }
 
+/// Try to fold an ISD::OR node into a single LoongArchISD::BSTRINS node.
+///
+/// BSTRINS takes (dest, src, msb, lsb) and produces dest with bits [msb:lsb]
+/// replaced by the low (msb - lsb + 1) bits of src. Eight OR shapes are
+/// recognised below: patterns 1-7 are tried with both operand orders first,
+/// and pattern 8 is tried (again with both orders) only if none of them hit.
+///
+/// Only runs after operation legalization and only for 32- and 64-bit values.
+/// \returns the BSTRINS node, or an empty SDValue if no pattern matched.
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const LoongArchSubtarget &Subtarget) {
+  MVT GRLenVT = Subtarget.getGRLenVT();
+  EVT ValTy = N->getValueType(0);
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+  ConstantSDNode *CN0, *CN1;
+  SDLoc DL(N);
+  unsigned ValBits = ValTy.getSizeInBits();
+  unsigned MaskIdx0, MaskLen0, MaskIdx1, MaskLen1;
+  unsigned Shamt;
+  bool SwapAndRetried = false;
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (ValBits != 32 && ValBits != 64)
+    return SDValue();
+
+Retry:
+  // 1st pattern to match BSTRINS:
+  //  R = or (and X, mask0), (and (shl Y, lsb), mask1)
+  //  where mask1 = (2**size - 1) << lsb, mask0 = ~mask1
+  //  =>
+  //  R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1)
+  if (N0.getOpcode() == ISD::AND &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+      N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) &&
+      MaskIdx0 == MaskIdx1 && MaskLen0 == MaskLen1 &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+      (Shamt = CN1->getZExtValue()) == MaskIdx0 &&
+      (MaskIdx0 + MaskLen0 <= ValBits)) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 1\n");
+    return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+                       N1.getOperand(0).getOperand(0),
+                       DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+                       DAG.getConstant(MaskIdx0, DL, GRLenVT));
+  }
+
+  // 2nd pattern to match BSTRINS:
+  //  R = or (and X, mask0), (shl (and Y, mask1), lsb)
+  //  where mask1 = (2**size - 1), mask0 = ~(mask1 << lsb)
+  //  =>
+  //  R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1)
+  if (N0.getOpcode() == ISD::AND &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+      N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      (Shamt = CN1->getZExtValue()) == MaskIdx0 &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+      isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) &&
+      MaskLen0 == MaskLen1 && MaskIdx1 == 0 &&
+      (MaskIdx0 + MaskLen0 <= ValBits)) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 2\n");
+    return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+                       N1.getOperand(0).getOperand(0),
+                       DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+                       DAG.getConstant(MaskIdx0, DL, GRLenVT));
+  }
+
+  // 3rd pattern to match BSTRINS:
+  //  R = or (and X, mask0), (and Y, mask1)
+  //  where ~mask0 = (2**size - 1) << lsb, mask0 & mask1 = 0
+  //  =>
+  //  R = BSTRINS X, (shr (and Y, mask1), lsb), msb, lsb
+  //  where msb = lsb + size - 1
+  if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+      (MaskIdx0 + MaskLen0 <= 64) &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1->getOperand(1))) &&
+      (CN1->getSExtValue() & CN0->getSExtValue()) == 0) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 3\n");
+    // For i32, getSExtValue() of a non-negative mask0 leaves bits 63..32 of
+    // ~mask0 set, so MaskLen0 overshoots by 32; "& 31" trims it back.
+    return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+                       DAG.getNode(ISD::SRL, DL, N1->getValueType(0), N1,
+                                   DAG.getConstant(MaskIdx0, DL, GRLenVT)),
+                       DAG.getConstant(ValBits == 32
+                                           ? (MaskIdx0 + (MaskLen0 & 31) - 1)
+                                           : (MaskIdx0 + MaskLen0 - 1),
+                                       DL, GRLenVT),
+                       DAG.getConstant(MaskIdx0, DL, GRLenVT));
+  }
+
+  // 4th pattern to match BSTRINS:
+  //  R = or (and X, mask), (shl Y, shamt)
+  //  where mask = (2**shamt - 1)
+  //  =>
+  //  R = BSTRINS X, Y, ValBits - 1, shamt
+  //  where ValBits = 32 or 64
+  if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::SHL &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(CN0->getZExtValue(), MaskIdx0, MaskLen0) &&
+      MaskIdx0 == 0 && (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      (Shamt = CN1->getZExtValue()) == MaskLen0 &&
+      (MaskIdx0 + MaskLen0 <= ValBits)) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 4\n");
+    return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+                       N1.getOperand(0),
+                       DAG.getConstant((ValBits - 1), DL, GRLenVT),
+                       DAG.getConstant(Shamt, DL, GRLenVT));
+  }
+
+  // 5th pattern to match BSTRINS:
+  //  R = or (and X, mask), const
+  //  where ~mask = (2**size - 1) << lsb, mask & const = 0
+  //  =>
+  //  R = BSTRINS X, (const >> lsb), msb, lsb
+  //  where msb = lsb + size - 1
+  if (N0.getOpcode() == ISD::AND &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1)) &&
+      (CN1->getSExtValue() & CN0->getSExtValue()) == 0) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 5\n");
+    // As in pattern 3, trim the sign-extension run for i32 so that msb stays
+    // within the 5-bit immediate accepted by BSTRINS_W.
+    return DAG.getNode(
+        LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+        DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy),
+        DAG.getConstant(ValBits == 32 ? (MaskIdx0 + (MaskLen0 & 31) - 1)
+                                      : (MaskIdx0 + MaskLen0 - 1),
+                        DL, GRLenVT),
+        DAG.getConstant(MaskIdx0, DL, GRLenVT));
+  }
+
+  // 6th pattern.
+  // a = b | ((c & mask) << shamt), where all positions in b to be overwritten
+  // by the incoming bits are known to be zero.
+  // =>
+  // a = BSTRINS b, c, shamt + MaskLen - 1, shamt
+  //
+  // Note that the 2nd pattern is a special situation of the 6th, i.e. the 6th
+  // pattern is more general than the 2nd. So we put the 2nd before the 6th in
+  // order to match as many nodes as possible.
+  ConstantSDNode *CNMask, *CNShamt;
+  unsigned MaskIdx, MaskLen;
+  if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND &&
+      (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+      isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) &&
+      MaskIdx == 0 && (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      CNShamt->getZExtValue() + MaskLen <= ValBits) {
+    Shamt = CNShamt->getZExtValue();
+    APInt ShMask(ValBits, CNMask->getZExtValue() << Shamt);
+    if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) {
+      LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 6\n");
+      return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0,
+                         N1.getOperand(0).getOperand(0),
+                         DAG.getConstant(Shamt + MaskLen - 1, DL, GRLenVT),
+                         DAG.getConstant(Shamt, DL, GRLenVT));
+    }
+  }
+
+  // 7th pattern.
+  // a = b | ((c << shamt) & shifted_mask), where all positions in b to be
+  // overwritten by the incoming bits are known to be zero.
+  // =>
+  // a = BSTRINS b, c, MaskIdx + MaskLen - 1, MaskIdx
+  //
+  // Similarly, the 7th pattern is more general than the 1st. So we put the 1st
+  // before the 7th in order to match as many nodes as possible.
+  if (N1.getOpcode() == ISD::AND &&
+      (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) &&
+      N1.getOperand(0).getOpcode() == ISD::SHL &&
+      (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+      CNShamt->getZExtValue() == MaskIdx) {
+    APInt ShMask(ValBits, CNMask->getZExtValue());
+    if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) {
+      LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 7\n");
+      return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0,
+                         N1.getOperand(0).getOperand(0),
+                         DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT),
+                         DAG.getConstant(MaskIdx, DL, GRLenVT));
+    }
+  }
+
+  // (or a, b) and (or b, a) are equivalent, so swap the operands and retry.
+  if (!SwapAndRetried) {
+    std::swap(N0, N1);
+    SwapAndRetried = true;
+    goto Retry;
+  }
+
+  SwapAndRetried = false;
+Retry2:
+  // 8th pattern.
+  // a = b | (c & shifted_mask), where all positions in b to be overwritten by
+  // the incoming bits are known to be zero.
+  // =>
+  // a = BSTRINS b, c >> MaskIdx, MaskIdx + MaskLen - 1, MaskIdx
+  //
+  // Similarly, the 8th pattern is more general than the 4th and 5th patterns,
+  // so we put it here in order to match as many nodes as possible or generate
+  // fewer instructions.
+  if (N1.getOpcode() == ISD::AND &&
+      (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen)) {
+    APInt ShMask(ValBits, CNMask->getZExtValue());
+    if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) {
+      LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 8\n");
+      return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0,
+                         DAG.getNode(ISD::SRL, DL, N1->getValueType(0),
+                                     N1->getOperand(0),
+                                     DAG.getConstant(MaskIdx, DL, GRLenVT)),
+                         DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT),
+                         DAG.getConstant(MaskIdx, DL, GRLenVT));
+    }
+  }
+  // Swap N0/N1 and retry.
+  if (!SwapAndRetried) {
+    std::swap(N0, N1);
+    SwapAndRetried = true;
+    goto Retry2;
+  }
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -414,6 +649,8 @@
     break;
   case ISD::AND:
     return performANDCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:
+    return performORCombine(N, DAG, DCI, Subtarget);
   case ISD::SRL:
     return performSRLCombine(N, DAG, DCI, Subtarget);
   }
@@ -483,6 +720,7 @@
     NODE_NAME_CASE(SLL_W)
     NODE_NAME_CASE(SRA_W)
     NODE_NAME_CASE(SRL_W)
+    NODE_NAME_CASE(BSTRINS)
     NODE_NAME_CASE(BSTRPICK)
   }
 #undef NODE_NAME_CASE
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -26,6 +26,11 @@
   SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>
 ]>;
 
+def SDT_LoongArchBStrIns: SDTypeProfile<1, 4, [
+  SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>,
+  SDTCisSameAs<3, 4>
+]>;
+
 def SDT_LoongArchBStrPick: SDTypeProfile<1, 3, [
   SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<2, 3>
 ]>;
@@ -46,6 +51,8 @@
 def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
 def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
+def loongarch_bstrins
+    : SDNode<"LoongArchISD::BSTRINS", SDT_LoongArchBStrIns>;
 def loongarch_bstrpick
     : SDNode<"LoongArchISD::BSTRPICK", SDT_LoongArchBStrPick>;
 
@@ -774,15 +781,21 @@
 def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
                 PseudoInstExpansion<(JIRL R0, R1, 0)>;
 
-/// BSTRPICK
+/// BSTRINS and BSTRPICK
 
-let Predicates = [IsLA32] in
+let Predicates = [IsLA32] in {
+def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd),
+          (BSTRINS_W GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>;
 def : Pat<(loongarch_bstrpick GPR:$rj, uimm5:$msbd, uimm5:$lsbd),
           (BSTRPICK_W GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>;
+} // Predicates = [IsLA32]
 
-let Predicates = [IsLA64] in
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd),
+          (BSTRINS_D GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>;
 def : Pat<(loongarch_bstrpick GPR:$rj, uimm6:$msbd, uimm6:$lsbd),
           (BSTRPICK_D GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>;
+} // Predicates = [IsLA64]
 
 /// Loads
 
diff --git a/llvm/test/CodeGen/LoongArch/bstrins_d.ll b/llvm/test/CodeGen/LoongArch/bstrins_d.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/bstrins_d.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s
+
+;; Test generation of the bstrins.d instruction.
+;; There are 8 patterns that can be matched to bstrins.d. See performORCombine
+;; for details.
+
+;; Pattern 1
+;; R = or (and X, mask0), (and (shl Y, lsb), mask1)
+;; =>
+;; R = BSTRINS X, Y, msb, lsb
+define i64 @pat1(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -1099511562241 ; 0xffffff000000ffff
+  %shl = shl i64 %b, 16
+  %and2 = and i64 %shl, 1099511562240 ; 0x000000ffffff0000
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+define i64 @pat1_swap(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat1_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -1099511562241 ; 0xffffff000000ffff
+  %shl = shl i64 %b, 16
+  %and2 = and i64 %shl, 1099511562240 ; 0x000000ffffff0000
+  %or = or i64 %and2, %and1
+  ret i64 %or
+}
+
+;; Pattern 2
+;; R = or (and X, mask0), (shl (and Y, mask1), lsb)
+;; =>
+;; R = BSTRINS X, Y, msb, lsb
+define i64 @pat2(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -1099511562241 ; 0xffffff000000ffff
+  %and2 = and i64 %b, 16777215 ; 0x0000000000ffffff
+  %shl = shl i64 %and2, 16
+  %or = or i64 %and1, %shl
+  ret i64 %or
+}
+
+define i64 @pat2_swap(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat2_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -1099511562241 ; 0xffffff000000ffff
+  %and2 = and i64 %b, 16777215 ; 0x0000000000ffffff
+  %shl = shl i64 %and2, 16
+  %or = or i64 %shl, %and1
+  ret i64 %or
+}
+
+;; Pattern 3
+;; R = or (and X, mask0), (and Y, mask1)
+;; =>
+;; R = BSTRINS X, (srl (and Y, mask1), lsb), msb, lsb
+define i64 @pat3(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi $a1, $a1, 288
+; CHECK-NEXT:    srli.d $a1, $a1, 4
+; CHECK-NEXT:    bstrins.d $a0, $a1, 11, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -4081 ; 0xfffffffffffff00f
+  %and2 = and i64 %b, 288 ; 0x0000000000000120
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+define i64 @pat3_swap(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat3_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi $a1, $a1, 288
+; CHECK-NEXT:    srli.d $a1, $a1, 4
+; CHECK-NEXT:    bstrins.d $a0, $a1, 11, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -4081 ; 0xfffffffffffff00f
+  %and2 = and i64 %b, 288 ; 0x0000000000000120
+  %or = or i64 %and2, %and1
+  ret i64 %or
+}
+
+;; Pattern 4
+;; R = or (and X, mask), (shl Y, shamt)
+;; =>
+;; R = BSTRINS X, Y, 63, shamt
+define i64 @pat4(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 63, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %a, 255
+  %shl = shl i64 %b, 8
+  %or = or i64 %and, %shl
+  ret i64 %or
+}
+
+define i64 @pat4_swap(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat4_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 63, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %a, 255
+  %shl = shl i64 %b, 8
+  %or = or i64 %shl, %and
+  ret i64 %or
+}
+
+;; Pattern 5
+;; R = or (and X, mask0), const
+;; =>
+;; R = BSTRINS X, (const >> lsb), msb, lsb
+define i64 @pat5(i64 %a) nounwind {
+; CHECK-LABEL: pat5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 74565
+; CHECK-NEXT:    ori $a1, $a1, 1656
+; CHECK-NEXT:    bstrins.d $a0, $a1, 47, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %a, 18446462598732906495 ; 0xffff00000000ffff
+  %or = or i64 %and, 20015998304256 ; 0x0000123456780000
+  ret i64 %or
+}
+
+;; Pattern 6: a = b | ((c & mask) << shamt)
+;; In this testcase b is 0x123456000000789a, but in fact we do not require b
+;; being a constant. As long as all positions in b to be overwritten by the
+;; incoming bits are known to be zero, the pattern could be matched.
+define i64 @pat6(i64 %c) nounwind {
+; CHECK-LABEL: pat6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 7
+; CHECK-NEXT:    ori $a1, $a1, 2202
+; CHECK-NEXT:    lu32i.d $a1, 284160
+; CHECK-NEXT:    lu52i.d $a1, $a1, 291
+; CHECK-NEXT:    bstrins.d $a1, $a0, 39, 16
+; CHECK-NEXT:    move $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %c, 16777215 ; 0x0000000000ffffff
+  %shl = shl i64 %and, 16
+  %or = or i64 %shl, 1311767949471676570 ; 0x123456000000789a
+  ret i64 %or
+}
+
+;; Pattern 7: a = b | ((c << shamt) & shifted_mask)
+;; Similar to pattern 6.
+define i64 @pat7(i64 %c) nounwind {
+; CHECK-LABEL: pat7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 7
+; CHECK-NEXT:    ori $a1, $a1, 2202
+; CHECK-NEXT:    lu32i.d $a1, 284160
+; CHECK-NEXT:    lu52i.d $a1, $a1, 291
+; CHECK-NEXT:    bstrins.d $a1, $a0, 39, 16
+; CHECK-NEXT:    move $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %shl = shl i64 %c, 16
+  %and = and i64 %shl, 1099511562240 ; 0x000000ffffff0000
+  %or = or i64 %and, 1311767949471676570 ; 0x123456000000789a
+  ret i64 %or
+}
+
+;; Pattern 8: a = b | (c & shifted_mask)
+;; Similar to pattern 7 but without shift to c.
+define i64 @pat8(i64 %c) nounwind {
+; CHECK-LABEL: pat8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli.d $a1, $a0, 16
+; CHECK-NEXT:    lu12i.w $a0, 7
+; CHECK-NEXT:    ori $a0, $a0, 2202
+; CHECK-NEXT:    lu32i.d $a0, 284160
+; CHECK-NEXT:    lu52i.d $a0, $a0, 291
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %c, 1099511562240 ; 0x000000ffffff0000
+  %or = or i64 %and, 1311767949471676570 ; 0x123456000000789a
+  ret i64 %or
+}
+
+;; Test that bstrins.d is not generated because constant OR operand
+;; doesn't fit into bits cleared by constant AND operand.
+define i64 @no_bstrins_d(i64 %a) nounwind {
+; CHECK-LABEL: no_bstrins_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 354185
+; CHECK-NEXT:    lu32i.d $a1, 4660
+; CHECK-NEXT:    or $a0, $a0, $a1
+; CHECK-NEXT:    lu12i.w $a1, 354191
+; CHECK-NEXT:    ori $a1, $a1, 4095
+; CHECK-NEXT:    lu32i.d $a1, -60876
+; CHECK-NEXT:    and $a0, $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %a, 18446462598732906495 ; 0xffff00000000ffff
+  %or = or i64 %and, 20015998341120 ; 0x0000123456789000
+  ret i64 %or
+}
diff --git a/llvm/test/CodeGen/LoongArch/bstrins_w.ll b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s
+
+;; Test generation of the bstrins.w instruction.
+;; There are 8 patterns that can be matched to bstrins.w. See performORCombine
+;; for details.
+
+;; Pattern 1
+;; R = or (and X, mask0), (and (shl Y, lsb), mask1)
+;; =>
+;; R = BSTRINS X, Y, msb, lsb
+define i32 @pat1(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 19, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -1048321 ; 0xfff000ff
+  %shl = shl i32 %b, 8
+  %and2 = and i32 %shl, 1048320 ; 0x000fff00
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+define i32 @pat1_swap(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat1_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 19, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -1048321 ; 0xfff000ff
+  %shl = shl i32 %b, 8
+  %and2 = and i32 %shl, 1048320 ; 0x000fff00
+  %or = or i32 %and2, %and1
+  ret i32 %or
+}
+
+;; Pattern 2
+;; R = or (and X, mask0), (shl (and Y, mask1), lsb)
+;; =>
+;; R = BSTRINS X, Y, msb, lsb
+define i32 @pat2(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 19, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -1048321 ; 0xfff000ff
+  %and2 = and i32 %b, 4095 ; 0x00000fff
+  %shl = shl i32 %and2, 8
+  %or = or i32 %and1, %shl
+  ret i32 %or
+}
+
+define i32 @pat2_swap(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat2_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 19, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -1048321 ; 0xfff000ff
+  %and2 = and i32 %b, 4095 ; 0x00000fff
+  %shl = shl i32 %and2, 8
+  %or = or i32 %shl, %and1
+  ret i32 %or
+}
+
+;; Pattern 3
+;; R = or (and X, mask0), (and Y, mask1)
+;; =>
+;; R = BSTRINS X, (srl (and Y, mask1), lsb), msb, lsb
+define i32 @pat3(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi $a1, $a1, 288
+; CHECK-NEXT:    srli.w $a1, $a1, 4
+; CHECK-NEXT:    bstrins.w $a0, $a1, 11, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -4081 ; 0xfffff00f
+  %and2 = and i32 %b, 288 ; 0x00000120
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+define i32 @pat3_swap(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat3_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi $a1, $a1, 288
+; CHECK-NEXT:    srli.w $a1, $a1, 4
+; CHECK-NEXT:    bstrins.w $a0, $a1, 11, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -4081 ; 0xfffff00f
+  %and2 = and i32 %b, 288 ; 0x00000120
+  %or = or i32 %and2, %and1
+  ret i32 %or
+}
+
+define i32 @pat3_positive_mask0(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat3_positive_mask0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli.w $a1, $a1, 28
+; CHECK-NEXT:    bstrins.w $a0, $a1, 31, 28
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, 268435455 ; 0x0fffffff
+  %and2 = and i32 %b, 4026531840 ; 0xf0000000
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+;; Pattern 4
+;; R = or (and X, mask), (shl Y, shamt)
+;; =>
+;; R = BSTRINS X, Y, 31, shamt
+define i32 @pat4(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 31, 28
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %a, 268435455 ; 0x0fffffff
+  %shl = shl i32 %b, 28
+  %or = or i32 %and, %shl
+  ret i32 %or
+}
+
+define i32 @pat4_swap(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat4_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 31, 28
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %a, 268435455 ; 0x0fffffff
+  %shl = shl i32 %b, 28
+  %or = or i32 %shl, %and
+  ret i32 %or
+}
+
+;; Pattern 5
+;; R = or (and X, mask), const
+;; =>
+;; R = BSTRINS X, (const >> lsb), msb, lsb
+define i32 @pat5(i32 %a) nounwind {
+; CHECK-LABEL: pat5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 1
+; CHECK-NEXT:    ori $a1, $a1, 564
+; CHECK-NEXT:    bstrins.w $a0, $a1, 23, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %a, 4278190335 ; 0xff0000ff
+  %or = or i32 %and, 1192960 ; 0x00123400
+  ret i32 %or
+}
+
+;; Pattern 6: a = b | ((c & mask) << shamt)
+;; In this testcase b is 0x10000002, but in fact we do not require b being a
+;; constant. As long as all positions in b to be overwritten by the incoming
+;; bits are known to be zero, the pattern could be matched.
+define i32 @pat6(i32 %c) nounwind {
+; CHECK-LABEL: pat6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 65536
+; CHECK-NEXT:    ori $a1, $a1, 2
+; CHECK-NEXT:    bstrins.w $a1, $a0, 27, 4
+; CHECK-NEXT:    move $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %c, 16777215 ; 0x00ffffff
+  %shl = shl i32 %and, 4
+  %or = or i32 %shl, 268435458 ; 0x10000002
+  ret i32 %or
+}
+
+;; Pattern 7: a = b | ((c << shamt) & shifted_mask)
+;; Similar to pattern 6.
+define i32 @pat7(i32 %c) nounwind {
+; CHECK-LABEL: pat7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 65536
+; CHECK-NEXT:    ori $a1, $a1, 2
+; CHECK-NEXT:    bstrins.w $a1, $a0, 27, 4
+; CHECK-NEXT:    move $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %shl = shl i32 %c, 4
+  %and = and i32 %shl, 268435440 ; 0x0ffffff0
+  %or = or i32 %and, 268435458 ; 0x10000002
+  ret i32 %or
+}
+
+;; Pattern 8: a = b | (c & shifted_mask)
+;; Similar to pattern 7 but without shift to c.
+define i32 @pat8(i32 %c) nounwind {
+; CHECK-LABEL: pat8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli.w $a1, $a0, 4
+; CHECK-NEXT:    lu12i.w $a0, 65536
+; CHECK-NEXT:    ori $a0, $a0, 2
+; CHECK-NEXT:    bstrins.w $a0, $a1, 27, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %c, 268435440 ; 0x0ffffff0
+  %or = or i32 %and, 268435458 ; 0x10000002
+  ret i32 %or
+}
+
+;; Test that bstrins.w is not generated because constant OR operand
+;; doesn't fit into bits cleared by constant AND operand.
+define i32 @no_bstrins_w(i32 %a) nounwind {
+; CHECK-LABEL: no_bstrins_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 291
+; CHECK-NEXT:    ori $a1, $a1, 1104
+; CHECK-NEXT:    or $a0, $a0, $a1
+; CHECK-NEXT:    lu12i.w $a1, -3805
+; CHECK-NEXT:    ori $a1, $a1, 1279
+; CHECK-NEXT:    and $a0, $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %a, 4278190335 ; 0xff0000ff
+  %or = or i32 %and, 1193040 ; 0x00123450
+  ret i32 %or
+}