Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -481,6 +481,7 @@
 
   // We combine OR nodes for bitfield operations.
   setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::AND);
 
   // Vector add and sub nodes may conceal a high-half opportunity.
   // Also, try to fold ADD into CSINC/CSINV..
@@ -7444,6 +7445,30 @@
   return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
 }
 
+// (op shl(x, c1), shl(y, c2)) -> (op shl(y, c2), shl(x, c1)) if 0 < c1 < c2.
+// This is especially useful on Kryo.
+static SDValue CanonicalizeOperands(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opc = N->getOpcode();
+  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR ||
+          Opc == ISD::ADD) &&
+         "Unexpected opcode");
+  EVT VT = N->getValueType(0);
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDLoc dl(N);
+  if (LHS.getOpcode() == ISD::SHL && RHS.getOpcode() == ISD::SHL &&
+      VT.isInteger()) {
+    ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+    ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    if (C0 && C1 && C0->getSExtValue() > 0 &&
+        C1->getSExtValue() > C0->getSExtValue())
+      return DAG.getNode(Opc, dl, VT, RHS, LHS);
+  }
+
+  return SDValue();
+}
+
 // Generate SUBS and CSEL for integer abs.
 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
@@ -7469,7 +7494,7 @@
                            DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
                            SDValue(Cmp.getNode(), 1));
       }
-  return SDValue();
+  return CanonicalizeOperands(N, DAG);
 }
 
 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
@@ -7915,6 +7940,15 @@
   return SDValue();
 }
 
+static SDValue performANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  return CanonicalizeOperands(N, DAG);
+}
+
 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
   // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
@@ -7932,7 +7966,7 @@
   if (SDValue Res = tryCombineToBSL(N, DCI))
     return Res;
 
-  return SDValue();
+  return CanonicalizeOperands(N, DAG);
 }
 
 static SDValue performBitcastCombine(SDNode *N,
@@ -8286,7 +8320,7 @@
   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
     std::swap(LHS, RHS);
     if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
-      return SDValue();
+      return CanonicalizeOperands(Op, DAG);
   }
 
   // FIXME: This could be generatized to work for FP comparisons.
@@ -9825,6 +9859,8 @@
     return performFpToIntCombine(N, DAG, Subtarget);
   case ISD::FDIV:
     return performFDivCombine(N, DAG, Subtarget);
+  case ISD::AND:
+    return performANDCombine(N, DCI);
   case ISD::OR:
     return performORCombine(N, DCI, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
Index: test/CodeGen/AArch64/kryo-lsl.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/kryo-lsl.ll
@@ -0,0 +1,198 @@
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s
+
+; Verify that when both operands of a commutative operation are shifted left by
+; a constant, the smaller shift is the one folded into the shifted-register form.
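+;
+; For example, (add (shl i32 %a, 3), (shl i32 %b, 4)) is expected to produce
+; (see lsl_add2 below):
+;   lsl w8, w1, #4
+;   add w0, w8, w0, lsl #3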
+
+define i32 @lsl_add1(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_add1:
+; CHECK: lsl w8, w0, #4
+; CHECK-NEXT: add w0, w8, w1, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i32 %a, 4
+  %shl1 = shl i32 %b, 3
+  %add = add i32 %shl, %shl1
+  ret i32 %add
+}
+
+define i32 @lsl_add2(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_add2:
+; CHECK: lsl w8, w1, #4
+; CHECK-NEXT: add w0, w8, w0, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i32 %a, 3
+  %shl1 = shl i32 %b, 4
+  %add = add i32 %shl, %shl1
+  ret i32 %add
+}
+
+define i64 @lsl_add3(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_add3:
+; CHECK: lsl x8, x0, #4
+; CHECK-NEXT: add x0, x8, x1, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i64 %a, 4
+  %shl1 = shl i64 %b, 3
+  %add = add i64 %shl, %shl1
+  ret i64 %add
+}
+
+define i64 @lsl_add4(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_add4:
+; CHECK: lsl x8, x1, #4
+; CHECK-NEXT: add x0, x8, x0, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i64 %a, 3
+  %shl1 = shl i64 %b, 4
+  %add = add i64 %shl, %shl1
+  ret i64 %add
+}
+
+define i32 @lsl_and1(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_and1:
+; CHECK: lsl w8, w0, #4
+; CHECK-NEXT: and w0, w8, w1, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i32 %a, 4
+  %shl1 = shl i32 %b, 3
+  %and = and i32 %shl, %shl1
+  ret i32 %and
+}
+
+define i32 @lsl_and2(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_and2:
+; CHECK: lsl w8, w1, #4
+; CHECK-NEXT: and w0, w8, w0, lsl #3
+; CHECK-NEXT: ret
+
+entry:
+  %shl = shl i32 %a, 3
+  %shl1 = shl i32 %b, 4
+  %and = and i32 %shl, %shl1
+  ret i32 %and
+}
+
+define i64 @lsl_and3(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_and3:
+; CHECK: lsl x8, x0, #4
+; CHECK-NEXT: and x0, x8, x1, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i64 %a, 4
+  %shl1 = shl i64 %b, 3
+  %and = and i64 %shl, %shl1
+  ret i64 %and
+}
+
+define i64 @lsl_and4(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_and4:
+; CHECK: lsl x8, x1, #4
+; CHECK-NEXT: and x0, x8, x0, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i64 %a, 3
+  %shl1 = shl i64 %b, 4
+  %and = and i64 %shl, %shl1
+  ret i64 %and
+}
+
+define i32 @lsl_or1(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_or1:
+; CHECK: lsl w8, w0, #4
+; CHECK-NEXT: orr w0, w8, w1, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i32 %a, 4
+  %shl1 = shl i32 %b, 3
+  %or = or i32 %shl, %shl1
+  ret i32 %or
+}
+
+define i32 @lsl_or2(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_or2:
+; CHECK: lsl w8, w1, #4
+; CHECK-NEXT: orr w0, w8, w0, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i32 %a, 3
+  %shl1 = shl i32 %b, 4
+  %or = or i32 %shl, %shl1
+  ret i32 %or
+}
+
+define i64 @lsl_or3(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_or3:
+; CHECK: lsl x8, x0, #4
+; CHECK-NEXT: orr x0, x8, x1, lsl #3
+; CHECK-NEXT: ret
+
+entry:
+  %shl = shl i64 %a, 4
+  %shl1 = shl i64 %b, 3
+  %or = or i64 %shl, %shl1
+  ret i64 %or
+}
+
+define i64 @lsl_or4(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_or4:
+; CHECK: lsl x8, x1, #4
+; CHECK-NEXT: orr x0, x8, x0, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i64 %a, 3
+  %shl1 = shl i64 %b, 4
+  %or = or i64 %shl, %shl1
+  ret i64 %or
+}
+
+define i32 @lsl_xor1(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_xor1:
+; CHECK: lsl w8, w0, #4
+; CHECK-NEXT: eor w0, w8, w1, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i32 %a, 4
+  %shl1 = shl i32 %b, 3
+  %xor = xor i32 %shl, %shl1
+  ret i32 %xor
+}
+
+define i32 @lsl_xor2(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_xor2:
+; CHECK: lsl w8, w1, #4
+; CHECK-NEXT: eor w0, w8, w0, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i32 %a, 3
+  %shl1 = shl i32 %b, 4
+  %xor = xor i32 %shl, %shl1
+  ret i32 %xor
+}
+
+define i64 @lsl_xor3(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_xor3:
+; CHECK: lsl x8, x0, #4
+; CHECK-NEXT: eor x0, x8, x1, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i64 %a, 4
+  %shl1 = shl i64 %b, 3
+  %xor = xor i64 %shl, %shl1
+  ret i64 %xor
+}
+
+define i64 @lsl_xor4(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_xor4:
+; CHECK: lsl x8, x1, #4
+; CHECK-NEXT: eor x0, x8, x0, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i64 %a, 3
+  %shl1 = shl i64 %b, 4
+  %xor = xor i64 %shl, %shl1
+  ret i64 %xor
+}