Index: lib/Target/R600/AMDGPUISelLowering.h
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.h
+++ lib/Target/R600/AMDGPUISelLowering.h
@@ -196,6 +196,14 @@
   SAMPLEB,
   SAMPLED,
   SAMPLEL,
+
+  // Convert a float to half and store it in the low 16-bits of the 32-bit
+  // integer output.
+  // XXX - Would it be better to handle this as a subregister?
+  CVT_F16_F32,
+
+  // Convert the low 16-bits of the 32-bit integer input to a float.
+  CVT_F32_F16,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   STORE_MSKOR,
   LOAD_CONSTANT,
Index: lib/Target/R600/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.cpp
+++ lib/Target/R600/AMDGPUISelLowering.cpp
@@ -84,6 +84,24 @@
 #include "AMDGPUGenCallingConv.inc"
 
+static EVT getFloatVT(LLVMContext &Context, EVT VT) {
+  return VT.isVector() ?
+    EVT::getVectorVT(Context, MVT::f32, VT.getVectorNumElements()) :
+    MVT::f32;
+}
+
+static EVT getShortVT(LLVMContext &Context, EVT VT) {
+  return VT.isVector() ?
+    EVT::getVectorVT(Context, MVT::i16, VT.getVectorNumElements()) :
+    MVT::i16;
+}
+
+static EVT getIntVT(LLVMContext &Context, EVT VT) {
+  return VT.isVector() ?
+    EVT::getVectorVT(Context, MVT::i32, VT.getVectorNumElements()) :
+    MVT::i32;
+}
+
 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   TargetLowering(TM, new TargetLoweringObjectFileELF()) {
@@ -154,6 +172,9 @@
   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
   setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Custom);
+  setTruncStoreAction(MVT::f32, MVT::f16, Custom);
+  setTruncStoreAction(MVT::f64, MVT::f16, Custom);
 
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
@@ -927,6 +948,28 @@
   EVT VT = Op.getValueType();
   EVT MemVT = Load->getMemoryVT();
 
+  EVT MemEltVT = MemVT.getScalarType();
+  if (MemEltVT == MVT::f16) {
+    assert(ExtType != ISD::NON_EXTLOAD);
+
+    EVT ShortVT = getShortVT(*DAG.getContext(), VT);
+    EVT IntVT = getIntVT(*DAG.getContext(), VT);
+    EVT FloatVT = getFloatVT(*DAG.getContext(), VT);
+
+    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, IntVT,
+                                       Load->getChain(),
+                                       Load->getBasePtr(),
+                                       ShortVT,
+                                       Load->getMemOperand());
+
+    SDValue Conv = DAG.getNode(AMDGPUISD::CVT_F32_F16, DL, FloatVT, ExtLoad32);
+
+    if (VT.getScalarType() == MVT::f32)
+      return Conv;
+
+    return DAG.getNode(ISD::FP_EXTEND, DL, VT, Conv);
+  }
+
   if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
     // We can do the extload to 32-bits, and then need to separately extend to
     // 64-bits.
@@ -986,7 +1029,6 @@
   Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
 
-  EVT MemEltVT = MemVT.getScalarType();
   if (ExtType == ISD::SEXTLOAD) {
     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
@@ -1011,6 +1053,28 @@
   }
 
   EVT MemVT = Store->getMemoryVT();
+
+  EVT MemEltVT = MemVT.getScalarType();
+  if (MemEltVT == MVT::f16) {
+    assert(Store->isTruncatingStore());
+
+    SDValue Val = Store->getValue();
+    EVT VT = Val.getValueType();
+
+    if (VT.getScalarType() != MVT::f32) {
+      Val = DAG.getNode(ISD::FP_ROUND, DL, getFloatVT(*DAG.getContext(), VT),
+                        Val, DAG.getConstant(0, MVT::i32));
+    }
+
+    SDValue Conv = DAG.getNode(AMDGPUISD::CVT_F16_F32, DL, MVT::i32, Val);
+    EVT ShortVT = VT.isVector() ?
+      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()) :
+      MVT::i16;
+
+    return DAG.getTruncStore(Chain, DL, Conv, Store->getBasePtr(), ShortVT,
+                             Store->getMemOperand());
+  }
+
   if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
       MemVT.bitsLT(MVT::i32)) {
     unsigned Mask = 0;
@@ -1407,6 +1471,8 @@
   NODE_NAME_CASE(SAMPLEB)
   NODE_NAME_CASE(SAMPLED)
   NODE_NAME_CASE(SAMPLEL)
+  NODE_NAME_CASE(CVT_F16_F32)
+  NODE_NAME_CASE(CVT_F32_F16)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   }
@@ -1460,6 +1526,11 @@
     computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), KnownZero,
                               KnownOne, DAG, Depth);
     break;
+
+  case AMDGPUISD::CVT_F16_F32:
+    KnownZero = APInt::getHighBitsSet(32, 16);
+    break;
+
   default:
     break;
   }
Index: lib/Target/R600/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/R600/AMDGPUInstrInfo.td
+++ lib/Target/R600/AMDGPUInstrInfo.td
@@ -100,3 +100,18 @@
 def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
   [SDNPCommutative]
 >;
+
+// half is treated as a 32-bit integer.
+def AMDGPUF16_F32Op : SDTypeProfile<1, 1,
+  [SDTCisInt<0>, SDTCisFP<1>]>;
+
+def AMDGPUF32_F16Op : SDTypeProfile<1, 1,
+  [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def AMDGPUcvt_f16_f32 : SDNode<"AMDGPUISD::CVT_F16_F32", AMDGPUF16_F32Op,
+  []
+>;
+
+def AMDGPUcvt_f32_f16 : SDNode<"AMDGPUISD::CVT_F32_F16", AMDGPUF32_F16Op,
+  []
+>;
Index: lib/Target/R600/SIInstructions.td
===================================================================
--- lib/Target/R600/SIInstructions.td
+++ lib/Target/R600/SIInstructions.td
@@ -962,8 +962,15 @@
   [(set i32:$dst, (fp_to_sint f32:$src0))]
 >;
 defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
-////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
-//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
+
+defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32",
+  [(set i32:$dst, (AMDGPUcvt_f16_f32 f32:$src0))]
+>;
+
+defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16",
+  [(set f32:$dst, (AMDGPUcvt_f32_f16 i32:$src0))]
+>;
+
 //defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
 //defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
 //defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
Index: test/CodeGen/R600/half.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/half.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @extload_half_to_float
+; SI: BUFFER_LOAD_USHORT [[REG:v[0-9]+]],
+; SI: V_CVT_F32_F16_e32 v{{[0-9]+}}, [[REG]]
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
+define void @extload_half_to_float(float addrspace(1)* %out, half addrspace(1)* %in) nounwind {
+  %load = load half addrspace(1)* %in, align 2
+  %ext = fpext half %load to float
+  store float %ext, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @extload_half_to_double
+; SI: BUFFER_LOAD_USHORT [[LOADREG:v[0-9]+]],
+; SI: V_CVT_F32_F16_e32 [[CONVREG0:v[0-9]+]], [[LOADREG]]
+; SI: V_CVT_F64_F32_e32 v{{\[[0-9]+:[0-9]+\]}}, [[CONVREG0]]
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @extload_half_to_double(double addrspace(1)* %out, half addrspace(1)* %in) nounwind {
+  %load = load half addrspace(1)* %in, align 2
+  %ext = fpext half %load to double
+  store double %ext, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: @truncstore_float_to_half
+; SI: V_CVT_F16_F32_e32 [[RESULT:v[0-9]+]],
+; SI: BUFFER_STORE_SHORT [[RESULT]],
+; SI: S_ENDPGM
+define void @truncstore_float_to_half(half addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %load = load float addrspace(1)* %in, align 4
+  %ext = fptrunc float %load to half
+  store half %ext, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; SI-LABEL: @truncstore_double_to_half
+; SI: V_CVT_F32_F64_e32 [[CVTFLOAT:v[0-9]+]],
+; SI: V_CVT_F16_F32_e32 [[RESULT:v[0-9]+]], [[CVTFLOAT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]],
+; SI: S_ENDPGM
+define void @truncstore_double_to_half(half addrspace(1)* %out, double addrspace(1)* %in) nounwind {
+  %load = load double addrspace(1)* %in, align 8
+  %ext = fptrunc double %load to half
+  store half %ext, half addrspace(1)* %out, align 2
+  ret void
+}
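
The custom load/store lowering above also handles vector memory types through the
getFloatVT/getShortVT/getIntVT helpers, but half.ll only covers the scalar cases.
Below is a minimal sketch of an additional test that could exercise the vector
path. It is not part of the patch: the function name is made up, and it assumes a
<2 x half> extload actually reaches the same custom lowering and survives the rest
of legalization on this target, so no CHECK lines are asserted.

; Hypothetical follow-up test (not in this patch): vector f16 extload path,
; assuming <2 x half> loads hit the same custom lowering as the scalar case.
define void @extload_v2half_to_v2float(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
  %load = load <2 x half> addrspace(1)* %in, align 4
  %ext = fpext <2 x half> %load to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out, align 8
  ret void
}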