Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -440,6 +441,7 @@ SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); + SDNode *ReassociateOrForRotate(SDValue Op0, SDValue Op1, const SDLoc &dl); SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, SDValue InnerNeg, unsigned PosOpcode, unsigned NegOpcode, @@ -4890,6 +4892,11 @@ // See if this is some rotate idiom. if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) return SDValue(Rot, 0); + // If N0 or N1 are themselves ORs, there is still a potential for a rotate + // whose parts got reassociated with some other stuff. + if (N0.getOpcode() == ISD::OR || N1.getOpcode() == ISD::OR) + if (SDNode *Or = ReassociateOrForRotate(N0, N1, SDLoc(N))) + return SDValue(Or, 0); if (SDValue Load = MatchLoadCombine(N)) return Load; @@ -4901,6 +4908,94 @@ return SDValue(); } +SDNode *DAGCombiner::ReassociateOrForRotate(SDValue Op0, SDValue Op1, + const SDLoc &dl) { + EVT VT = Op0.getValueType(); + if (!hasOperation(ISD::ROTL, VT) && !hasOperation(ISD::ROTR, VT)) + return nullptr; + + // Expand all single-use ORs into a list (OredOps). + SmallVector OredOps; + std::queue WorkQ; + WorkQ.push(Op0); + WorkQ.push(Op1); + + while (!WorkQ.empty()) { + SDValue V = WorkQ.front(); + WorkQ.pop(); + if (V.getOpcode() == ISD::OR && V.hasOneUse()) { + WorkQ.push(V.getOperand(0)); + WorkQ.push(V.getOperand(1)); + } else + OredOps.push_back(V); + } + + // Since only shifts of the same SDValue can end up paired up into a rotate, + // create separate lists of shifts for each shifted value. + DenseMap> OpMap; + for (unsigned I = 0, E = OredOps.size(); I != E; ++I) { + SDValue V = OredOps[I]; + unsigned Opc = V.getOpcode(); + if (Opc == ISD::SHL || Opc == ISD::SRL) + OpMap[V.getOperand(0)->getIROrder()].push_back(I); + } + + // Sort the shifts with respect to the opcodes. This is to group + // the SHL operations into one contiguous block and same for SRL. + auto OpcOrder = [&OredOps](unsigned I, unsigned J) { + return OredOps[I].getOpcode() < OredOps[J].getOpcode(); + }; + + bool CreatedRotate = false; + + for (auto P : OpMap) { + auto &Shifts = P.second; + assert(!Shifts.empty() && "OpMap should not have empty lists"); + llvm::sort(Shifts.begin(), Shifts.end(), OpcOrder); + // The list of shifts should only have SHL and SRL on it grouped into + // two contiguous segments. Find the beginning of the second segment. + auto Boundary = std::upper_bound(std::next(Shifts.begin()), Shifts.end(), + Shifts.front(), OpcOrder); + + for (unsigned I = 0, E = Boundary - Shifts.begin(); I != E; ++I) { + for (unsigned J = E, F = Shifts.size(); J != F; ++J) { + SDValue &OI = OredOps[Shifts[I]], &OJ = OredOps[Shifts[J]]; + if (!OJ) + continue; + if (SDNode *T = MatchRotate(OI, OJ, dl)) { + OredOps.push_back(SDValue(T, 0)); + OI = OJ = SDValue(); + CreatedRotate = true; + // When a rotate is created, stop the inner loop traversal, but + // continue with the outer loop so that more opportunities for + // rotates of the same value could be found. + break; + } + } + } + } + + // All pairs of left-right shifts have been examined. Now, re-package + // the values back into an OR tree. + if (!CreatedRotate) + return nullptr; + + auto OredEnd = remove_if(OredOps, [](SDValue V) { return !bool(V); }); + unsigned Size = OredEnd - OredOps.begin(); + while (Size != 1) { + for (unsigned i = 0; i != Size/2; ++i) + OredOps[i] = DAG.getNode(ISD::OR, dl, VT, OredOps[2*i], OredOps[2*i+1]); + if (Size % 2 != 0) { + OredOps[Size/2] = OredOps[Size-1]; + Size = Size/2 + 1; + } else + Size /= 2; + } + + // The last remaining op is the root. + return OredOps[0].getNode(); +} + /// Match "(X shl/srl V1) & V2" where V2 may not be present. bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) { if (Op.getOpcode() == ISD::AND) { Index: test/CodeGen/Hexagon/rotate-multi.ll =================================================================== --- /dev/null +++ test/CodeGen/Hexagon/rotate-multi.ll @@ -0,0 +1,80 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; OR of two rotates of %a0(r0). +; CHECK-LABEL: f0: +; CHECK: r[[R00:[0-9]+]] = rol(r0,#7) +; CHECK: r[[R00]] |= rol(r0,#9) +define i32 @f0(i32 %a0) #0 { +b0: + %v0 = shl i32 %a0, 7 + %v1 = lshr i32 %a0, 25 + %v2 = or i32 %v0, %v1 + %v3 = shl i32 %a0, 9 + %v4 = lshr i32 %a0, 23 + %v5 = or i32 %v3, %v4 + %v6 = or i32 %v2, %v5 + ret i32 %v6 +} + +; OR of two rotates of %a0(r0) with an extra input %a1(r1). +; CHECK-LABEL: f1: +; CHECK: r1 |= asl(r0,#7) +; CHECK: r1 |= rol(r0,#9) +define i32 @f1(i32 %a0, i32 %a1) #0 { +b0: + %v0 = shl i32 %a0, 7 + %v1 = lshr i32 %a0, 25 + %v2 = or i32 %v0, %a1 + %v3 = shl i32 %a0, 9 + %v4 = lshr i32 %a0, 23 + %v5 = or i32 %v3, %v4 + %v6 = or i32 %v2, %v5 + %v7 = or i32 %v6, %v1 + ret i32 %v6 +} + +; OR of two rotates of two different inputs: %a0(r0) and %a1(r1). +; CHECK-LABEL: f2: +; CHECK: r0 = rol(r0,#11) +; CHECK: r0 |= rol(r1,#19) +define i32 @f2(i32 %a0, i32 %a1) #0 { + %v0 = shl i32 %a0, 11 + %v1 = lshr i32 %a0, 21 + %v2 = shl i32 %a1, 19 + %v3 = lshr i32 %a1, 13 + %v4 = or i32 %v0, %v2 + %v5 = or i32 %v1, %v3 + %v6 = or i32 %v4, %v5 + ret i32 %v6 +} + +; ORs of multiple shifts of the same value with only one pair actually +; matching a rotate. +; CHECK-LABEL: f3: +; CHECK-NOT: rol +; CHECK: r[[R30:[0-9]+]] |= rol(r0,#7) +; CHECK-NOT: rol +define i32 @f3(i32 %a0) #0 { + %v0 = shl i32 %a0, 3 + %v1 = shl i32 %a0, 5 + %v2 = shl i32 %a0, 7 ; rotate + %v3 = shl i32 %a0, 13 + %v4 = shl i32 %a0, 19 + %v5 = lshr i32 %a0, 2 + %v6 = lshr i32 %a0, 15 + %v7 = lshr i32 %a0, 23 + %v8 = lshr i32 %a0, 25 ; rotate + %v9 = lshr i32 %a0, 30 + %v10 = or i32 %v0, %v1 + %v11 = or i32 %v10, %v2 + %v12 = or i32 %v11, %v3 + %v13 = or i32 %v12, %v4 + %v14 = or i32 %v13, %v5 + %v15 = or i32 %v14, %v6 + %v16 = or i32 %v15, %v7 + %v17 = or i32 %v16, %v8 + %v18 = or i32 %v17, %v9 + ret i32 %v18 +} + +attributes #0 = { readnone nounwind "target-cpu"="hexagonv60" "target-features"="-packets" } Index: test/CodeGen/Hexagon/rotate.ll =================================================================== --- test/CodeGen/Hexagon/rotate.ll +++ test/CodeGen/Hexagon/rotate.ll @@ -147,6 +147,17 @@ ret i32 %v3 } +; CHECK-LABEL: f11 +; CHECK: r0 |= rol(r1,#7) +define i32 @f11(i32 %a0, i32 %a1) #0 { +b0: + %v0 = shl i32 %a1, 7 + %v1 = lshr i32 %a1, 25 + %v2 = or i32 %v1, %a0 + %v3 = or i32 %v2, %v0 + ret i32 %v3 +} + ; CHECK-LABEL: f12 ; CHECK: r0 ^= rol(r1,#7) define i32 @f12(i32 %a0, i32 %a1) #0 { @@ -191,6 +202,17 @@ ret i64 %v3 } +; CHECK-LABEL: f16 +; CHECK: r1:0 |= rol(r3:2,#7) +define i64 @f16(i64 %a0, i64 %a1) #0 { +b0: + %v0 = shl i64 %a1, 7 + %v1 = lshr i64 %a1, 57 + %v2 = or i64 %v1, %a0 + %v3 = or i64 %v2, %v0 + ret i64 %v3 +} + ; CHECK-LABEL: f17 ; CHECK: r1:0 ^= rol(r3:2,#7) define i64 @f17(i64 %a0, i64 %a1) #0 { Index: test/CodeGen/X86/rotate-multi.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/rotate-multi.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s + +; OR of two rotates of %a0(edi). +define i32 @f0(i32 %a0) #0 { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: # %b0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: roll $9, %eax +; CHECK-NEXT: roll $7, %edi +; CHECK-NEXT: orl %eax, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq +b0: + %v0 = shl i32 %a0, 7 + %v1 = lshr i32 %a0, 25 + %v2 = or i32 %v0, %v1 + %v3 = shl i32 %a0, 9 + %v4 = lshr i32 %a0, 23 + %v5 = or i32 %v3, %v4 + %v6 = or i32 %v2, %v5 + ret i32 %v6 +} + +; OR of two rotates of %a0(edi) with an extra input %a1(esi). +define i32 @f1(i32 %a0, i32 %a1) #0 { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: # %b0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll $7, %eax +; CHECK-NEXT: roll $9, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: orl %eax, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq +b0: + %v0 = shl i32 %a0, 7 + %v1 = lshr i32 %a0, 25 + %v2 = or i32 %v0, %a1 + %v3 = shl i32 %a0, 9 + %v4 = lshr i32 %a0, 23 + %v5 = or i32 %v3, %v4 + %v6 = or i32 %v2, %v5 + %v7 = or i32 %v6, %v1 + ret i32 %v6 +} + +; OR of two rotates of two different inputs: %a0(edi) and %a1(esi). +define i32 @f2(i32 %a0, i32 %a1) #0 { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: roll $19, %esi +; CHECK-NEXT: roll $11, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %v0 = shl i32 %a0, 11 + %v1 = lshr i32 %a0, 21 + %v2 = shl i32 %a1, 19 + %v3 = lshr i32 %a1, 13 + %v4 = or i32 %v0, %v2 + %v5 = or i32 %v1, %v3 + %v6 = or i32 %v4, %v5 + ret i32 %v6 +} + +; ORs of multiple shifts of the same value with only one pair actually +; matching a rotate. +define i32 @f3(i32 %a0) #0 { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: leal (,%rdi,8), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: shll $5, %ecx +; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll $13, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: shll $19, %ecx +; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrl $2, %eax +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: shrl $15, %edx +; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrl $23, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: shrl $30, %ecx +; CHECK-NEXT: roll $7, %edi +; CHECK-NEXT: orl %eax, %edi +; CHECK-NEXT: orl %ecx, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %v0 = shl i32 %a0, 3 + %v1 = shl i32 %a0, 5 + %v2 = shl i32 %a0, 7 ; rotate + %v3 = shl i32 %a0, 13 + %v4 = shl i32 %a0, 19 + %v5 = lshr i32 %a0, 2 + %v6 = lshr i32 %a0, 15 + %v7 = lshr i32 %a0, 23 + %v8 = lshr i32 %a0, 25 ; rotate + %v9 = lshr i32 %a0, 30 + %v10 = or i32 %v0, %v1 + %v11 = or i32 %v10, %v2 + %v12 = or i32 %v11, %v3 + %v13 = or i32 %v12, %v4 + %v14 = or i32 %v13, %v5 + %v15 = or i32 %v14, %v6 + %v16 = or i32 %v15, %v7 + %v17 = or i32 %v16, %v8 + %v18 = or i32 %v17, %v9 + ret i32 %v18 +} + +attributes #0 = { readnone nounwind }