Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2396,6 +2396,12 @@ (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; +def : GCNPat < + (v2f16 (build_vector (f16 (bitconvert (i16 (trunc VGPR_32:$src0)))), + (f16 (bitconvert (i16 (trunc VGPR_32:$src1)))))), + (V_PACK_B32_F16_e64 SRCMODS.NONE, VGPR_32:$src0, SRCMODS.NONE, VGPR_32:$src1) +>; + } // End SubtargetPredicate = HasVOP3PInsts Index: llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -179,8 +179,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] ; GFX9-NEXT: ;;#ASMEND Index: llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -74,8 +74,7 @@ ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] +; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1671,16 +1671,14 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6 ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_lshl_b32 s4, s7, 4 +; GFX9-NEXT: v_pack_b32_f16 v3, s6, s6 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1 -; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0 +; GFX9-NEXT: v_bfi_b32 v0, s2, v3, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/pack.v2f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}s_pack_v2f16: ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_load_dword [[VAL1:s[0-9]+]] -; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] +; GFX9: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[PACKED]] ; GFX9: ; use [[PACKED]] define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 { %val0 = load volatile i32, i32 addrspace(4)* %in0 @@ -59,8 +59,7 @@ ; GFX9: global_load_dword [[VAL0:v[0-9]+]] ; GFX9: global_load_dword [[VAL1:v[0-9]+]] -; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]] -; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] +; GFX9: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]] ; GFX9: ; use [[PACKED]] define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -84,8 +83,7 @@ ; GFX9: global_load_dword [[VAL0:v[0-9]+]] ; GFX9: global_load_dword [[VAL1:v[0-9]+]] -; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]] -; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] +; GFX9: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]] ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 9, [[PACKED]] define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { Index: llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -113,12 +113,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[2:3], off ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -133,13 +132,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -399,10 +397,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_pack_b32_f16 v1, v0, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -418,10 +415,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -439,11 +435,11 @@ ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: v_pack_b32_f16 v1, v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -458,11 +454,12 @@ ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pack_b32_f16 v0, v1, v5 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -478,12 +475,12 @@ ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pack_b32_f16 v0, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -565,13 +562,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, v2, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_pack_b32_f16 v0, v2, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -583,13 +578,12 @@ ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_pack_b32_f16 v0, v5, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -605,8 +599,7 @@ ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_pack_b32_f16 v1, v1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -621,8 +614,7 @@ ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_pack_b32_f16 v1, v1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -697,10 +689,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -716,8 +707,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_pack_b32_f16 v1, v1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1 @@ -730,10 +720,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_pack_b32_f16 v1, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1 @@ -816,13 +805,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: v_pack_b32_f16 v1, v1, v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1