Index: include/llvm/CodeGen/ValueTypes.h
===================================================================
--- include/llvm/CodeGen/ValueTypes.h
+++ include/llvm/CodeGen/ValueTypes.h
@@ -89,6 +89,16 @@
       return VecTy;
     }
 
+    EVT changeTypeToInteger() {
+      if (isVector())
+        return changeVectorElementTypeToInteger();
+
+      if (isSimple())
+        return MVT::getIntegerVT(getSizeInBits());
+
+      return changeExtendedTypeToInteger();
+    }
+
     /// isSimple - Test if the given EVT is simple (as opposed to being
     /// extended).
     bool isSimple() const {
@@ -342,6 +352,7 @@
     // Methods for handling the Extended-type case in functions above.
     // These are all out-of-line to prevent users of this header file
     // from having a dependency on Type.h.
+    EVT changeExtendedTypeToInteger() const;
     EVT changeExtendedVectorElementTypeToInteger() const;
     static EVT getExtendedIntegerVT(LLVMContext &C, unsigned BitWidth);
     static EVT getExtendedVectorVT(LLVMContext &C, EVT VT,
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1095,7 +1095,8 @@
       break;
     }
     case TargetLowering::Expand:
-      if (!TLI.isLoadExtLegal(ISD::EXTLOAD, Node->getValueType(0), SrcVT)) {
+      EVT DestVT = Node->getValueType(0);
+      if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT)) {
         // If the source type is not legal, see if there is a legal extload to
         // an intermediate type that we can then extend further.
         EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT());
@@ -1114,6 +1115,23 @@
           Chain = Load.getValue(1);
           break;
         }
+
+        // Handle the special case of fp16 extloads. EXTLOAD doesn't have the
+        // normal undefined upper bits behavior to allow using an in-reg extend
+        // with the illegal FP type, so load as an integer and do the
+        // from-integer conversion.
+        if (SrcVT.getScalarType() == MVT::f16) {
+          EVT ISrcVT = SrcVT.changeTypeToInteger();
+          EVT IDestVT = DestVT.changeTypeToInteger();
+          EVT LoadVT = TLI.getRegisterType(IDestVT.getSimpleVT());
+
+          SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, LoadVT,
+                                          Chain, Ptr, ISrcVT,
+                                          LD->getMemOperand());
+          Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result);
+          Chain = Result.getValue(1);
+          break;
+        }
       }
 
       assert(!SrcVT.isVector() &&
Index: lib/IR/ValueTypes.cpp
===================================================================
--- lib/IR/ValueTypes.cpp
+++ lib/IR/ValueTypes.cpp
@@ -19,6 +19,11 @@
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
+EVT EVT::changeExtendedTypeToInteger() const {
+  LLVMContext &Context = LLVMTy->getContext();
+  return getIntegerVT(Context, getSizeInBits());
+}
+
 EVT EVT::changeExtendedVectorElementTypeToInteger() const {
   LLVMContext &Context = LLVMTy->getContext();
   EVT IntTy = getIntegerVT(Context, getVectorElementType().getSizeInBits());
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -461,12 +461,6 @@
   return TII->isInlineConstant(Imm);
 }
 
-static EVT toIntegerVT(EVT VT) {
-  if (VT.isVector())
-    return VT.changeVectorElementTypeToInteger();
-  return MVT::getIntegerVT(VT.getSizeInBits());
-}
-
 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          SDLoc SL, SDValue Chain,
                                          unsigned Offset, bool Signed) const {
@@ -490,30 +484,10 @@
 
   unsigned Align = DL.getABITypeAlignment(Ty);
 
-  if (VT != MemVT && VT.isFloatingPoint()) {
-    // Do an integer load and convert.
-    // FIXME: This is mostly because load legalization after type legalization
-    // doesn't handle FP extloads.
-    assert(VT.getScalarType() == MVT::f32 &&
-           MemVT.getScalarType() == MVT::f16);
-
-    EVT IVT = toIntegerVT(VT);
-    EVT MemIVT = toIntegerVT(MemVT);
-    SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD,
-                               IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT,
-                               false, // isVolatile
-                               true, // isNonTemporal
-                               true, // isInvariant
-                               Align); // Alignment
-    SDValue Ops[] = {
-      DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
-      Load.getValue(1)
-    };
-
-    return DAG.getMergeValues(Ops, SL);
-  }
-
   ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+  if (MemVT.isFloatingPoint())
+    ExtTy = ISD::EXTLOAD;
+
   return DAG.getLoad(ISD::UNINDEXED, ExtTy,
                      VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
                      false, // isVolatile
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -112,12 +112,24 @@
 }
 
 ; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
+; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
+; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
+; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
 define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
   %ext = fpext half %arg to double
   store double %ext, double addrspace(1)* %out
   ret void
 }
+
 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN: s_endpgm
 define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
   %ext = fpext <2 x half> %arg to <2 x double>
   store <2 x double> %ext, <2 x double> addrspace(1)* %out
@@ -125,6 +137,16 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN: s_endpgm
 define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
   %ext = fpext <3 x half> %arg to <3 x double>
   store <3 x double> %ext, <3 x double> addrspace(1)* %out
@@ -132,6 +154,19 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN: s_endpgm
 define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
   %ext = fpext <4 x half> %arg to <4 x double>
   store <4 x double> %ext, <4 x double> addrspace(1)* %out
@@ -139,6 +174,37 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+; GCN-DAG: buffer_load_ushort v
+
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+; GCN-DAG: v_cvt_f64_f32_e32
+
+; GCN: s_endpgm
 define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
   %ext = fpext <8 x half> %arg to <8 x double>
   store <8 x double> %ext, <8 x double> addrspace(1)* %out
@@ -194,6 +260,12 @@
 }
 
 ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
+; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]]
+; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]]
+; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
+; GCN: s_endpgm
 define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %cvt = fpext <2 x half> %val to <2 x float>
@@ -246,6 +318,15 @@
 }
 
 ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
+; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]]
+; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]]
+; GCN-DAG: v_cvt_f64_f32_e32 [[CVT2:v\[[0-9]+:[0-9]+\]]], v[[CVT0]]
+; GCN-DAG: v_cvt_f64_f32_e32 [[CVT3:v\[[0-9]+:[0-9]+\]]], v[[CVT1]]
+; GCN-DAG: buffer_store_dwordx2 [[CVT2]]
+; GCN-DAG: buffer_store_dwordx2 [[CVT3]]
+; GCN: s_endpgm
 define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %cvt = fpext <2 x half> %val to <2 x double>