Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -845,8 +845,8 @@ // ARM does not have ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); for (MVT VT : MVT::vector_valuetypes()) { - setOperationAction(ISD::ROTL, VT, Expand); - setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); } setOperationAction(ISD::CTTZ, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Expand); @@ -8018,6 +8018,72 @@ return !CI.second.getNode() ? DAG.getRoot() : CI.first; } +static SDValue LowerRotate(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + assert(VT.isVector() && VT.isInteger() && + "unexpected type for custom-lowering ISD::ROTR/ROTL"); + SDValue Value = Op.getOperand(0); + SDValue Amount = Op.getOperand(1); + EVT ShiftType = Amount.getValueType(); + bool Left = Op.getOpcode() == ISD::ROTL; + SDLoc DL(Op); + + unsigned Bits = VT.getScalarSizeInBits(); + KnownBits ShiftKnown = DAG.computeKnownBits(Amount); + + if (!ShiftKnown.isConstant()) { + // Normal shift-shift-or, we don't have a constant. + SDValue Temporary = DAG.getNode(Left ? ISD::SHL : ISD::SRL, DL, VT, Value, Amount); + Amount = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Bits, DL, ShiftType), Amount); + Value = DAG.getNode(Left ? ISD::SHL : ISD::SRL, DL, VT, Value, Amount); + return DAG.getNode(ISD::OR, DL, VT, Value, Temporary); + } + + const uint64_t Shift = ShiftKnown.getConstant().getLimitedValue(129); + if (Shift == 0) { + // nothing to do. + return Op; + } else if (Bits >= 16 && Shift * 2 == Bits) { + // Rotating N/2 bits is very easy with the VREV instructions. + EVT NewVT; + unsigned Opcode; + switch (Bits) { + case 16: + NewVT = VT.is128BitVector() ? MVT::v16i8 : MVT::v8i8; + Opcode = ARMISD::VREV16; + break; + case 32: + NewVT = VT.is128BitVector() ? 
MVT::v8i16 : MVT::v4i16; + Opcode = ARMISD::VREV32; + break; + case 64: + NewVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; + Opcode = ARMISD::VREV64; + break; + default: // don't know how we got here... + goto normal; + } + // Cast to the smaller vector + SDValue Reversed = DAG.getBitcast(NewVT, Value); + // vrevN.N/2 + Reversed = DAG.getNode(Opcode, DL, NewVT, Reversed); + // cast back + return DAG.getBitcast(VT, Reversed); + } else if (Shift >= Bits) { + // illegal + return SDValue(); + } +normal: + // vshl.iN q1, q0, #X + // vsri.N q1, q0, #(32 - x) + SDValue Forward = DAG.getConstant(Shift, DL, MVT::i32); + SDValue Reverse = DAG.getConstant(Bits - Shift, DL, MVT::i32); + + SDValue Temporary = DAG.getNode(Left ? ARMISD::VSHL : ARMISD::VSHRu, DL, VT, Value, Forward); + SDValue Rotated = DAG.getNode(Left ? ARMISD::VSRI : ARMISD::VSLI, DL, VT, Temporary, Value, Reverse); + return Rotated; +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -8098,6 +8164,9 @@ case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); + case ISD::ROTR: + case ISD::ROTL: + return LowerRotate(Op, DAG); } } Index: test/CodeGen/ARM/rotate.ll =================================================================== --- test/CodeGen/ARM/rotate.ll +++ test/CodeGen/ARM/rotate.ll @@ -1,14 +1,103 @@ ; RUN: llc < %s -mtriple=thumbv8--linux-gnueabihf | FileCheck %s -;; This used to cause a backend crash about not being able to -;; select ROTL. Make sure if generates the basic VSHL/VSHR. -define <2 x i64> @testcase(<2 x i64>* %in) { -; CHECK-LABEL: testcase +; We want vshl/vsri. 
+define <2 x i64> @vrolq_n_u64_8(<2 x i64>* %in) {
+; CHECK-LABEL: vrolq_n_u64_8
+; CHECK-NOT: vrev64.32
 ; CHECK: vshl.i64
-; CHECK: vshr.u64
+; CHECK: vsri.64
+; CHECK-NOT: vorr
   %1 = load <2 x i64>, <2 x i64>* %in
-  %2 = lshr <2 x i64> %1, <i64 8, i64 8>
-  %3 = shl <2 x i64> %1, <i64 56, i64 56>
+  %2 = lshr <2 x i64> %1, <i64 56, i64 56>
+  %3 = shl <2 x i64> %1, <i64 8, i64 8>
   %4 = or <2 x i64> %2, %3
   ret <2 x i64> %4
 }
+
+define <4 x i32> @vrolq_n_u32_13(<4 x i32>* %in) {
+; CHECK-LABEL: vrolq_n_u32_13
+; CHECK-NOT: vrev32.16
+; CHECK: vshl.i32
+; CHECK: vsri.32
+; CHECK-NOT: vshr.u32
+; CHECK-NOT: vsli.32
+; CHECK-NOT: vorr
+  %1 = load <4 x i32>, <4 x i32>* %in
+  %2 = lshr <4 x i32> %1, <i32 19, i32 19, i32 19, i32 19>
+  %3 = shl <4 x i32> %1, <i32 13, i32 13, i32 13, i32 13>
+  %4 = or <4 x i32> %2, %3
+  ret <4 x i32> %4
+}
+define <8 x i16> @vrolq_n_u16_3(<8 x i16>* %in) {
+; CHECK-LABEL: vrolq_n_u16_3
+; CHECK-NOT: vrev16.8
+; CHECK: vshl.i16
+; CHECK: vsri.16
+; CHECK-NOT: vorr
+  %1 = load <8 x i16>, <8 x i16>* %in
+  %2 = lshr <8 x i16> %1, <i16 13, i16 13, i16 13, i16 13, i16 13, i16 13, i16 13, i16 13>
+  %3 = shl <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %4 = or <8 x i16> %2, %3
+  ret <8 x i16> %4
+}
+
+; We want vrev64 when we are rotating by /2 bits.
+define <2 x i64> @vrolq_n_u64_32(<2 x i64>* %in) {
+; CHECK-LABEL: vrolq_n_u64_32
+; CHECK: vrev64.32
+; CHECK-NOT: vorr
+; CHECK-NOT: vsli.64
+; CHECK-NOT: vsri.64
+; CHECK-NOT: vshl.i64
+; CHECK-NOT: vshr.u64
+  %1 = load <2 x i64>, <2 x i64>* %in
+  %2 = lshr <2 x i64> %1, <i64 32, i64 32>
+  %3 = shl <2 x i64> %1, <i64 32, i64 32>
+  %4 = or <2 x i64> %2, %3
+  ret <2 x i64> %4
+}
+define <4 x i32> @vrolq_n_u32_16(<4 x i32>* %in) {
+; CHECK-LABEL: vrolq_n_u32_16
+; CHECK: vrev32.16
+; CHECK-NOT: vorr
+; CHECK-NOT: vsli.32
+; CHECK-NOT: vsri.32
+; CHECK-NOT: vshl.i32
+; CHECK-NOT: vshr.u32
+  %1 = load <4 x i32>, <4 x i32>* %in
+  %2 = lshr <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+  %3 = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+  %4 = or <4 x i32> %2, %3
+  ret <4 x i32> %4
+}
+define <8 x i16> @vrolq_n_u16_8(<8 x i16>* %in) {
+; CHECK-LABEL: vrolq_n_u16_8
+; CHECK: vrev16.8
+; CHECK-NOT: vorr
+; CHECK-NOT: vsli.16
+; CHECK-NOT: vsri.16
+; CHECK-NOT: vshl.i16
+; CHECK-NOT: vshr.u16
+  %1 = load <8 x i16>, <8 x i16>* %in
+  %2 = lshr <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %3 = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %4 = or <8 x i16> %2, %3
+  ret <8 x i16> %4
+}
+
+; No such thing as VREV8.
+define <16 x i8> @vrolq_n_u8_4(<16 x i8>* %in) {
+; CHECK-LABEL: vrolq_n_u8_4
+; CHECK-NOT: vrev
+; CHECK-NOT: vorr
+; CHECK: vshl.i8
+; CHECK: vsri.8
+; CHECK-NOT: vsri.8
+; CHECK-NOT: vshl.i8
+; CHECK-NOT: vshr.u8
+  %1 = load <16 x i8>, <16 x i8>* %in
+  %2 = lshr <16 x i8> %1, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+  %3 = shl <16 x i8> %1, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+  %4 = or <16 x i8> %2, %3
+  ret <16 x i8> %4
+}