diff --git a/llvm/lib/Target/ARC/ARCISelLowering.h b/llvm/lib/Target/ARC/ARCISelLowering.h
--- a/llvm/lib/Target/ARC/ARCISelLowering.h
+++ b/llvm/lib/Target/ARC/ARCISelLowering.h
@@ -77,6 +77,9 @@
 private:
   const ARCSubtarget &Subtarget;
 
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+
   // Lower Operand helpers
   SDValue LowerCallArguments(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
diff --git a/llvm/lib/Target/ARC/ARCISelLowering.cpp b/llvm/lib/Target/ARC/ARCISelLowering.cpp
--- a/llvm/lib/Target/ARC/ARCISelLowering.cpp
+++ b/llvm/lib/Target/ARC/ARCISelLowering.cpp
@@ -68,6 +68,31 @@
   }
 }
 
+// Replaces the results of N. The only node handled here is a
+// READCYCLECOUNTER whose first result is i64; it is rebuilt from an i32 read.
+void ARCTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue> &Results,
+                                           SelectionDAG &DAG) const {
+  LLVM_DEBUG(dbgs() << "[ARC-ISEL] ReplaceNodeResults ");
+  LLVM_DEBUG(N->dump(&DAG));
+  LLVM_DEBUG(dbgs() << "; use_count=" << N->use_size() << "\n");
+
+  // Leave Results untouched for every other opcode / result type.
+  if (N->getOpcode() != ISD::READCYCLECOUNTER ||
+      N->getValueType(0) != MVT::i64)
+    return;
+
+  // Re-issue the node with (i32, Other) results, reusing operand 0 of N,
+  // then zero-extend the 32-bit value to the 64 bits the caller asked for.
+  const SDLoc DL(N);
+  SDValue Counter32 =
+      DAG.getNode(ISD::READCYCLECOUNTER, DL,
+                  DAG.getVTList(MVT::i32, MVT::Other), N->getOperand(0));
+  Results.push_back(DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Counter32));
+  // The second replacement value is result #1 of the new i32 node.
+  Results.push_back(Counter32.getValue(1));
+}
+
 ARCTargetLowering::ARCTargetLowering(const TargetMachine &TM,
                                      const ARCSubtarget &Subtarget)
     : TargetLowering(TM), Subtarget(Subtarget) {
@@ -140,6 +165,10 @@
   //       when the HasBitScan predicate is available.
   setOperationAction(ISD::CTLZ, MVT::i32, Legal);
   setOperationAction(ISD::CTTZ, MVT::i32, Legal);
+
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i32, Legal);
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,
+                     isTypeLegal(MVT::i64) ? Legal : Custom);
 }
 
 const char *ARCTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -766,6 +795,13 @@
     return LowerJumpTable(Op, DAG);
   case ISD::VASTART:
     return LowerVASTART(Op, DAG);
+  case ISD::READCYCLECOUNTER:
+    // As of LLVM 3.8, the lowering code insists that we customize it even
+    // though we've declared the i32 version as legal. This is because it only
+    // thinks i64 is the truly supported version. We've already converted the
+    // i64 version to a widened i32.
+    assert(Op.getSimpleValueType() == MVT::i32);
+    return Op;
   default:
     llvm_unreachable("unimplemented operand");
   }
diff --git a/llvm/lib/Target/ARC/ARCInstrFormats.td b/llvm/lib/Target/ARC/ARCInstrFormats.td
--- a/llvm/lib/Target/ARC/ARCInstrFormats.td
+++ b/llvm/lib/Target/ARC/ARCInstrFormats.td
@@ -395,6 +395,50 @@
   let Inst{5-0} = S12{11-6};
 }
 
+// 1-register, signed 12-bit immediate Dual Operand instruction.
+// This instruction uses B as the first operand (e.g., lr B, [%count0]).
+// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
+// |B[2-0]  | 1| 0|            subop| F|B[5-3]  |S12[5-0]     |S12[11-6]  |
+class F32_SOP_RS12<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
+                   string asmstr, list<dag> pattern> :
+  InstARC<4, outs, ins, asmstr, pattern> {
+  bits<6> B;     // register field, split across Inst{26-24} and Inst{14-12}
+  bits<12> S12;  // 12-bit immediate, stored with its 6-bit halves swapped
+
+  let Inst{31-27} = major;    // not drawn in the bit table above
+  let Inst{26-24} = B{2-0};
+  let Inst{23-22} = 0b10;     // fixed bits; F32_SOP_RU6 uses 0b01 here
+  let Inst{21-16} = subop;
+  let Inst{15} = F;
+  let Inst{14-12} = B{5-3};
+  let Inst{11-6} = S12{5-0};  // low half of S12 sits in the higher field
+  let Inst{5-0} = S12{11-6};  // high half of S12 sits in the lowest bits
+
+  let DecoderMethod = "DecodeSOPwithRS12";  // C++ decoder in ARCDisassembler
+}
+
+// 1-register, unsigned 6-bit immediate Dual Operand instruction.
+// This instruction uses B as the first operand.
+// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
+// |B[2-0]  | 0| 1|            subop| F|B[5-3]  |U6           |0|0|0|0|0|0|
+class F32_SOP_RU6<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
+                  string asmstr, list<dag> pattern> :
+  InstARC<4, outs, ins, asmstr, pattern> {
+  bits<6> B;   // register field, split across Inst{26-24} and Inst{14-12}
+  bits<6> U6;  // 6-bit immediate, stored contiguously in Inst{11-6}
+
+  let Inst{31-27} = major;   // not drawn in the bit table above
+  let Inst{26-24} = B{2-0};
+  let Inst{23-22} = 0b01;    // fixed bits; F32_SOP_RS12 uses 0b10 here
+  let Inst{21-16} = subop;
+  let Inst{15} = F;
+  let Inst{14-12} = B{5-3};
+  let Inst{11-6} = U6;
+  let Inst{5-0} = 0;         // the low six bits are always zero in this form
+
+  let DecoderMethod = "DecodeSOPwithRU6";  // C++ decoder in ARCDisassembler
+}
+
 // 2-register, 32-bit immediate (LImm) Dual Operand instruction.
 // This instruction has the 32-bit immediate in bits 32-63, and
 // 62 in the C register operand slot, but is otherwise F32_DOP_RR.
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.td b/llvm/lib/Target/ARC/ARCInstrInfo.td
--- a/llvm/lib/Target/ARC/ARCInstrInfo.td
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.td
@@ -270,6 +270,19 @@
   def _rrlimm : Pat<(InFrag i32:$B, imm32:$LImm), (RRLImm i32:$B, imm32:$LImm)>;
 }
 
+// NOTE: This could be specialized later with a custom `PrintMethod` for
+//       displaying the aux register name, e.g. `[%count0]` instead of `[33]`.
+def AuxReg : Operand<i32>;
+
+def LR_rs12 : F32_SOP_RS12<0b00100, 0b101010, 0,
+                           (outs GPR32:$B), (ins AuxReg:$C),
+                           "lr\t$B, [$C]", []>;
+def LR_ru6 : F32_SOP_RU6<0b00100, 0b101010, 0,
+                         (outs GPR32:$B), (ins AuxReg:$C),
+                         "lr\t$B, [$C]", []>;
+
+def: Pat<(i32 readcyclecounter), (LR_rs12 0x21) >;  // 0x21 prints as [33]
+
 // ---------------------------------------------------------------------------
 // Instruction definitions and patterns for 3 operand binary instructions.
 // ---------------------------------------------------------------------------
diff --git a/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp b/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
--- a/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
+++ b/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
@@ -107,6 +107,12 @@
 static DecodeStatus DecodeLdRLImmInstruction(MCInst &, uint64_t, uint64_t,
                                              const void *);
 
+static DecodeStatus DecodeSOPwithRS12(MCInst &, uint64_t, uint64_t,
+                                      const void *);
+
+static DecodeStatus DecodeSOPwithRU6(MCInst &, uint64_t, uint64_t,
+                                     const void *);
+
 static DecodeStatus DecodeCCRU6Instruction(MCInst &, uint64_t, uint64_t,
                                            const void *);
 
@@ -311,6 +317,29 @@
   return MCDisassembler::Success;
 }
 
+// Decode `op B, u6`: the register comes from the B field, u6 from Inst{11-6}.
+static DecodeStatus DecodeSOPwithRU6(MCInst &Inst, uint64_t Insn,
+                                     uint64_t Address, const void *Decoder) {
+  DecodeGPR32RegisterClass(Inst, decodeBField(Insn), Address, Decoder);
+  // The immediate is unsigned, so the raw 6-bit field is used as-is.
+  const uint64_t Imm6 = fieldFromInstruction(Insn, 6, 6);
+  Inst.addOperand(MCOperand::createImm(Imm6));
+  return MCDisassembler::Success;
+}
+
+// Decode `op B, s12`; Inst{11-6} = S12[5:0] and Inst{5-0} = S12[11:6].
+static DecodeStatus DecodeSOPwithRS12(MCInst &Inst, uint64_t Insn,
+                                      uint64_t Address, const void *Decoder) {
+  DecodeGPR32RegisterClass(Inst, decodeBField(Insn), Address, Decoder);
+  const uint64_t Lower = fieldFromInstruction(Insn, 6, 6);
+  const uint64_t Upper = fieldFromInstruction(Insn, 0, 6);
+  // F32_SOP_RS12 copies the operand bits verbatim (two's complement), so
+  // sign-extend from bit 11 rather than treating Inst{5} as a sign flag.
+  const int64_t Raw = static_cast<int64_t>((Upper << 6) | Lower);
+  Inst.addOperand(MCOperand::createImm((Raw ^ 0x800) - 0x800));
+  return MCDisassembler::Success;
+}
+
 DecodeStatus ARCDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
                                              ArrayRef<uint8_t> Bytes,
                                              uint64_t Address,
diff --git a/llvm/test/CodeGen/ARC/intrinsics.ll b/llvm/test/CodeGen/ARC/intrinsics.ll
--- a/llvm/test/CodeGen/ARC/intrinsics.ll
+++ b/llvm/test/CodeGen/ARC/intrinsics.ll
@@ -4,20 +4,29 @@
 
 declare i32 @llvm.ctlz.i32(i32, i1)
 declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.readcyclecounter()
 
-; CHECK-LABEL: clz32:
+; CHECK-LABEL: test_ctlz_i32:
 ; CHECK:       fls.f   %r0, %r0
 ; CHECK-NEXT:  mov.eq  %r0, 32
 ; CHECK-NEXT:  rsub.ne %r0, %r0, 31
-define i32 @clz32(i32 %x) {
+define i32 @test_ctlz_i32(i32 %x) {
   %a = call i32 @llvm.ctlz.i32(i32 %x, i1 false)
   ret i32 %a
 }
 
-; CHECK-LABEL: ctz32:
+; CHECK-LABEL: test_cttz_i32:
 ; CHECK:       ffs.f   %r0, %r0
 ; CHECK-NEXT:  mov.eq  %r0, 32
-define i32 @ctz32(i32 %x) {
+define i32 @test_cttz_i32(i32 %x) {
   %a = call i32 @llvm.cttz.i32(i32 %x, i1 false)
   ret i32 %a
 }
+
+; CHECK-LABEL: test_readcyclecounter:
+; CHECK:       lr %r0, [33]
+; CHECK-NEXT:  mov %r1, 0
+define i64 @test_readcyclecounter() nounwind {
+  %a = call i64 @llvm.readcyclecounter()
+  ret i64 %a
+}
diff --git a/llvm/test/MC/Disassembler/ARC/ldst.txt b/llvm/test/MC/Disassembler/ARC/ldst.txt
--- a/llvm/test/MC/Disassembler/ARC/ldst.txt
+++ b/llvm/test/MC/Disassembler/ARC/ldst.txt
@@ -92,3 +92,35 @@
 
 # CHECK: stb.di.ab   %r0, [%r9,64]
 0x40 0x19 0x32 0x10
+
+# LR instructions with a U6 immediate bit pattern
+# ([33] maps to the [%count0] auxiliary register)
+
+# CHECK: lr %r0, [33]
+0x6a 0x20 0x40 0x08
+
+# CHECK: lr %r7, [33]
+0x6a 0x27 0x40 0x08
+
+# CHECK: lr %r15, [33]
+0x6a 0x27 0x40 0x18
+
+# CHECK: lr %r22, [33]
+0x6a 0x26 0x40 0x28
+
+# LR instructions with an S12 immediate bit pattern
+
+# CHECK: lr %r0, [33]
+0xaa 0x20 0x40 0x08
+
+# The following don't necessarily map to real auxiliary registers, but
+# the different range of numbers helps exercise the S12 decoder.
+
+# CHECK: lr %r0, [-33]
+0xaa 0x20 0xff 0x07
+
+# CHECK: lr %r0, [97]
+0xaa 0x20 0x41 0x08
+
+# CHECK: lr %r0, [-97]
+0xaa 0x20 0xfe 0x07