Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,10 +13,12 @@
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -233,6 +235,7 @@
   bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+  bool SelectV_PACK_B32_F16(SDNode *Node, EVT VT);
 
   bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
@@ -775,6 +778,9 @@
     }
   }
 
+  if (SelectV_PACK_B32_F16(N, VT))
+    return;
+
   break;
 }
 
@@ -2678,6 +2684,56 @@
   SelectCode(N);
 }
 
+bool AMDGPUDAGToDAGISel::SelectV_PACK_B32_F16(SDNode *Node, EVT VT) {
+  // Match this pattern when it's safe to do so:
+  //   (v2f16 (build_vector (f16 (bitconvert (i16 (trunc VGPR_32:$src0)))),
+  //                        (f16 (bitconvert (i16 (trunc VGPR_32:$src1)))))),
+  //   (V_PACK_B32_F16_e64 SRCMODS.NONE, VGPR_32:$src0,
+  //                       SRCMODS.NONE, VGPR_32:$src1)
+  unsigned Opc = Node->getOpcode();
+  unsigned NumVectorElts = VT.getVectorNumElements();
+  if (VT.getScalarSizeInBits() == 16) {
+    if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2 &&
+        Node->getNumOperands() >= 2) {
+      SDValue LHS = Node->getOperand(0);
+      SDValue RHS = Node->getOperand(1);
+
+      if (LHS.getValueType() == MVT::f16 && RHS.getValueType() == MVT::f16 &&
+          LHS.getOpcode() == ISD::BITCAST && RHS.getOpcode() == ISD::BITCAST &&
+          LHS.getNumOperands() >= 1 && RHS.getNumOperands() >= 1) {
+
+        SDValue Op0 = LHS.getOperand(0);
+        SDValue Op1 = RHS.getOperand(0);
+
+        if (Op0.getOpcode() == ISD::TRUNCATE &&
+            Op1.getOpcode() == ISD::TRUNCATE &&
+            Op0.getValueType() == MVT::i16 && Op1.getValueType() == MVT::i16) {
+
+          const SITargetLowering &Lowering =
+              *static_cast<const SITargetLowering *>(getTargetLowering());
+
+          // Only select this instruction if the inputs have already been
+          // flushed/quieted (i.e. they are canonicalized).
+          if (!Lowering.isCanonicalized(*CurDAG, Op0.getOperand(0)) ||
+              !Lowering.isCanonicalized(*CurDAG, Op1.getOperand(0)))
+            return false;
+
+          SDValue Mods = CurDAG->getTargetConstant(0, {}, MVT::i32);
+          // Operands: src0_modifiers, src0, src1_modifiers, src1, clamp,
+          // op_sel.
+          CurDAG->SelectNodeTo(
+              Node, AMDGPU::V_PACK_B32_F16_e64, Node->getVTList(),
+              {Mods, Op0, Mods, Op1,
+               CurDAG->getTargetConstant(0, {}, MVT::i1), Mods});
+
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                             unsigned &Mods,
                                             bool AllowAbs) const {
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9630,7 +9630,8 @@
         TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
       return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
     }
-  }
+  } else if (Op.getValueType() == MVT::i32 && Src.getValueType() == MVT::f32)
+    return isCanonicalized(DAG, Src, MaxDepth - 1);
 
   return false;
 }
Index: llvm/test/CodeGen/AMDGPU/v_pack.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+
+; GCN-LABEL: v_pack_b32_v2f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v0, 2.0, v1
+; GCN-NEXT:    v_add_f32_e32 v1, 2.0, v2
+; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; use v0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_endpgm
+define amdgpu_kernel void @v_pack_b32_v2f16(float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in0.gep = getelementptr inbounds float, float addrspace(1)* %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds float, float addrspace(1)* %in1, i64 %tid.ext
+  %v0 = load volatile float, float addrspace(1)* %in0.gep
+  %v1 = load volatile float, float addrspace(1)* %in1.gep
+  %v0.add = fadd float %v0, 2.0
+  %v1.add = fadd float %v1, 2.0
+  %val0 = bitcast float %v0.add to i32
+  %val1 = bitcast float %v1.add to i32
+  %lo.i = trunc i32 %val0 to i16
+  %hi.i = trunc i32 %val1 to i16
+  %lo = bitcast i16 %lo.i to half
+  %hi = bitcast i16 %hi.i to half
+  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
+  %vec.i32 = bitcast <2 x half> %vec.1 to i32
+  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
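A note on the test: as its NOTE header says, the CHECK lines in v_pack.ll were autogenerated by utils/update_llc_test_checks.py, so if the IR changes they should be regenerated rather than hand-edited. A typical invocation looks like the following (the build directory path is illustrative, not part of the patch):

  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc llvm/test/CodeGen/AMDGPU/v_pack.ll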