Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -185,6 +185,30 @@
   }];
 }
 
+// Matches an op whose operands are both already canonicalized, e.g. a v2f16
+// build_vector of canonical f16s that can select v_pack_b32_f16 directly.
+class is_canonicalized<SDPatternOperator op> : PatFrag<
+  (ops node:$src0, node:$src1),
+  (op $src0, $src1),
+  [{
+    const SITargetLowering &Lowering =
+        *static_cast<const SITargetLowering *>(getTargetLowering());
+
+    return Lowering.isCanonicalized(*CurDAG, N->getOperand(0)) &&
+           Lowering.isCanonicalized(*CurDAG, N->getOperand(1));
+  }]> {
+
+  // TODO: Improve the Legalizer for g_build_vector in Global Isel to match this class
+  let GISelPredicateCode = [{
+    const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+        MF.getSubtarget().getTargetLowering());
+
+    return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction &>(MF)) &&
+           TLI->isCanonicalized(MI.getOperand(2).getReg(), const_cast<MachineFunction &>(MF));
+  }];
+}
+
+
 let Properties = [SDNPCommutative, SDNPAssociative] in {
 def smax_oneuse : HasOneUseBinOp<smax>;
 def smin_oneuse : HasOneUseBinOp<smin>;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9634,19 +9634,18 @@
     // Could be anything.
     return false;
 
-  case ISD::BITCAST: {
+  case ISD::BITCAST:
+    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+  case ISD::TRUNCATE: {
     // Hack round the mess we make when legalizing extract_vector_elt
-    SDValue Src = Op.getOperand(0);
-    if (Src.getValueType() == MVT::i16 &&
-        Src.getOpcode() == ISD::TRUNCATE) {
-      SDValue TruncSrc = Src.getOperand(0);
+    if (Op.getValueType() == MVT::i16) {
+      SDValue TruncSrc = Op.getOperand(0);
       if (TruncSrc.getValueType() == MVT::i32 &&
           TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
         return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
       }
     }
-
     return false;
   }
   case ISD::INTRINSIC_WO_CHAIN: {
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2385,9 +2385,13 @@
   (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
 >;
 
+def : GCNPat <
+  (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
+                                         (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
+  (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
+>;
 } // End SubtargetPredicate = HasVOP3PInsts
 
-
 def : GCNPat <
   (v2f16 (scalar_to_vector f16:$src0)),
   (COPY $src0)
Index: llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -288,8 +288,7 @@
 ; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
 ; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
 ; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
-; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
-; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
+; GFX9: v_pack_b32_f16 [[V:v[0-9]+]], [[V1]], [[V0]]
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
Index: llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -552,7 +552,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:
 ; GFX9: s_waitcnt
 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
 ; GFX9-NEXT: s_setpc_b64
 
 ; High bits known zero
@@ -635,9 +635,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
 ; GFX9: s_waitcnt
 ; GFX9-DAG: v_max_f16_e32 v0, v0, v0
-; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4000
-; GFX9: v_and_b32_e32 v0, 0xffff, v0
-; GFX9: v_lshl_or_b32 v0, [[K]], 16, v0
+; GFX9: v_pack_b32_f16 v0, v0, 2.0
 ; GFX9: s_setpc_b64
 
 ; VI: s_waitcnt
@@ -653,8 +651,7 @@
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:
 ; GFX9: v_max_f16_e32 v0, v0, v0
-; GFX9: v_mov_b32_e32 [[K:v[0-9]+]], 0x4000
-; GFX9: v_lshl_or_b32 v0, v0, 16, [[K]]
+; GFX9: v_pack_b32_f16 v0, 2.0, v0
 ; GFX9: s_setpc_b64
 
 ; VI: s_waitcnt
@@ -680,8 +677,8 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:
 ; GFX9: s_waitcnt
 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
 ; GFX9-NEXT: s_setpc_b64
 
 ; VI: s_waitcnt
@@ -721,7 +718,7 @@
 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT: s_setpc_b64
 
Index: llvm/test/CodeGen/AMDGPU/fexp.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fexp.ll
+++ llvm/test/CodeGen/AMDGPU/fexp.ll
@@ -137,8 +137,7 @@
 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, [[SREG]] op_sel_hi:[1,0]
 ; GFX9-NEXT: v_exp_f16_e32 v1, v0
 ; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %result = call <2 x half> @llvm.exp.v2f16(<2 x half> %arg0)
   ret <2 x half> %result
@@ -198,14 +197,11 @@
 ; GFX9-NEXT: v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT: v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT: v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]]
-; GFX9-NEXT: v_exp_f16_e32 [[EXP2:v[0-9]+]], [[MUL2]]
-; GFX9-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL4]]
-; GFX9-NEXT: v_exp_f16_e32 [[EXP4:v[0-9]+]], [[MUL3]]
-; GFX9-NEXT: v_mov_b32_e32 [[VCONST:v[0-9]+]], 0xffff
-; GFX9-NEXT: v_and_b32_e32 [[AND1:v[0-9]+]], [[VCONST]], [[EXP2]]
-; GFX9-NEXT: v_and_b32_e32 [[AND2:v[0-9]+]], [[VCONST]], [[EXP1]]
-; GFX9-NEXT: v_lshl_or_b32 v0, [[EXP3]], 16, [[AND1]]
-; GFX9-NEXT: v_lshl_or_b32 v1, [[EXP4]], 16, [[AND2]]
+; GFX9-NEXT: v_exp_f16_e32 [[EXP2:v[0-9]+]], [[MUL3]]
+; GFX9-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL2]]
+; GFX9-NEXT: v_exp_f16_e32 [[EXP4:v[0-9]+]], [[MUL4]]
+; GFX9-NEXT: v_pack_b32_f16 v1, [[EXP1]], [[EXP2]]
+; GFX9-NEXT: v_pack_b32_f16 v0, [[EXP3]], [[EXP4]]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0)
   ret <4 x half> %result
Index: llvm/test/CodeGen/AMDGPU/fpow.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fpow.ll
+++ llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -190,21 +190,20 @@
 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1
 ; GFX10-NEXT: v_log_f32_e32 v2, v2
 ; GFX10-NEXT: v_log_f32_e32 v0, v0
 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
@@ -213,8 +212,7 @@
 ; GFX10-NEXT: v_exp_f32_e32 v0, v0
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
   ret <2 x half> %pow
@@ -274,31 +272,29 @@
 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_log_f32_e32 v2, v2
-; GFX10-NEXT: v_log_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
-; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v2
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-NEXT: v_log_f32_e32 v2, v2
+; GFX10-NEXT: v_log_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v2
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
@@ -359,31 +355,29 @@
 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_log_f32_e32 v2, v2
-; GFX10-NEXT: v_log_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
-; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v2
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX10-NEXT: v_log_f32_e32 v2, v2
+; GFX10-NEXT: v_log_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v2
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %y.fneg = fneg <2 x half> %y
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
@@ -449,21 +443,20 @@
 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1
 ; GFX10-NEXT: v_log_f32_e32 v2, v2
 ; GFX10-NEXT: v_log_f32_e32 v0, v0
 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
@@ -472,8 +465,7 @@
 ; GFX10-NEXT: v_exp_f32_e32 v0, v0
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %y.fneg = fneg <2 x half> %y
Index: llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -44,8 +44,7 @@
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
 
 ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
-; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
+; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
Index: llvm/test/CodeGen/AMDGPU/frem.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/frem.ll
+++ llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1514,8 +1514,7 @@
 ; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1
 ; GFX9-NEXT: v_trunc_f16_e32 v4, v4
 ; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT: s_endpgm
 ;
@@ -1550,8 +1549,7 @@
 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
 ; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX10-NEXT: v_pack_b32_f16 v1, v4, v1
 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT: s_endpgm
                        <2 x half> addrspace(1)* %in2) #0 {
@@ -1862,29 +1860,26 @@
 ; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1
 ; GFX9-NEXT: v_trunc_f16_e32 v6, v6
 ; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1
+; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v5, v5
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX9-NEXT: v_trunc_f16_e32 v3, v3
+; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_and_b32_e32 v5, v3, v5
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0
 ; GFX9-NEXT: v_rcp_f32_e32 v6, v6
 ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6
 ; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0
 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5
-; GFX9-NEXT: v_fma_f16 v5, -v5, v2, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX9-NEXT: v_rcp_f32_e32 v7, v7
-; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7
-; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX9-NEXT: v_div_fixup_f16 v6, v6, v2, v0
-; GFX9-NEXT: v_trunc_f16_e32 v6, v6
-; GFX9-NEXT: v_fma_f16 v0, -v6, v2, v0
-; GFX9-NEXT: v_and_b32_e32 v2, v3, v5
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT: s_endpgm
 ;
@@ -1919,30 +1914,27 @@
 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5
 ; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v5
-; GFX10-NEXT: v_rcp_f32_e32 v6, v6
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
+; GFX10-NEXT: v_rcp_f32_e32 v5, v5
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_trunc_f16_e32 v5, v5
-; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v2
+; GFX10-NEXT: v_trunc_f16_e32 v3, v3
+; GFX10-NEXT: v_fmac_f16_e64 v5, -v3, v2
 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX10-NEXT: v_rcp_f32_e32 v7, v7
-; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
-; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
-; GFX10-NEXT: v_trunc_f16_e32 v5, v5
-; GFX10-NEXT: v_fmac_f16_e64 v0, -v5, v2
-; GFX10-NEXT: v_and_b32_e32 v2, v3, v6
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX10-NEXT: v_rcp_f32_e32 v6, v6
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX10-NEXT: v_trunc_f16_e32 v3, v3
+; GFX10-NEXT: v_fmac_f16_e64 v0, -v3, v2
+; GFX10-NEXT: v_pack_b32_f16 v0, v5, v0
 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX10-NEXT: s_endpgm
                        <4 x half> addrspace(1)* %in2) #0 {
Index: llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -135,10 +135,9 @@
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cos_f16_e32 v3, v3
+; GFX9-NEXT: v_cos_f16_e32 v2, v3
 ; GFX9-NEXT: v_cos_f16_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT: s_endpgm
 ;
@@ -146,16 +145,15 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_e32 v2, v3
 ; GFX10-NEXT: v_cos_f16_e32 v1, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT: s_endpgm
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
Index: llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
@@ -55,8 +55,7 @@
 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
 ; VI-NOT: v_and_b32_e32
 ; VI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_0]], v[[R_F16_2]]
-; GFX9: v_and_b32_e32 v[[R_F32_4:[0-9]+]], 0xffff, v[[R_F32_3]]
-; GFX9: v_lshl_or_b32 v[[R_F32_5:[0-9]+]], v[[R_F32_2]], 16, v[[R_F32_4]]
+; GFX9: v_pack_b32_f16 v[[R_F32_5:[0-9]+]], v[[R_F32_3]], v[[R_F32_2]]
 ; SI: buffer_store_dword v[[R_F32_5]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
Index: llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
@@ -55,8 +55,7 @@
 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
 ; VI-NOT: v_and_b32_e32
 ; VI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_0]], v[[R_F16_2]]
-; GFX9: v_and_b32_e32 v[[R_F32_4:[0-9]+]], 0xffff, v[[R_F32_3]]
-; GFX9: v_lshl_or_b32 v[[R_F32_5:[0-9]+]], v[[R_F32_2]], 16, v[[R_F32_4]]
+; GFX9: v_pack_b32_f16 v[[R_F32_5:[0-9]+]], v[[R_F32_3]], v[[R_F32_2]]
 ; SI: buffer_store_dword v[[R_F32_5]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
Index: llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -43,8 +43,7 @@
 ; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
 ; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
+; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
Index: llvm/test/CodeGen/AMDGPU/llvm.round.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -87,8 +87,7 @@
 ; GFX89: v_bfi_b32 [[COPYSIGN0:v[0-9]+]], [[K]], [[BFI_K]],
 ; GFX89: v_bfi_b32 [[COPYSIGN1:v[0-9]+]], [[K]], [[BFI_K]],
 
-; GFX9: v_and_b32_e32
-; GFX9: v_lshl_or_b32
+; GFX9: v_pack_b32_f16
 define amdgpu_kernel void @round_v2f16(<2 x half> addrspace(1)* %out, i32 %in.arg) #0 {
   %in = bitcast i32 %in.arg to <2 x half>
   %result = call <2 x half> @llvm.round.v2f16(<2 x half> %in)
Index: llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -135,10 +135,9 @@
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_sin_f16_e32 v3, v3
+; GFX9-NEXT: v_sin_f16_e32 v2, v3
 ; GFX9-NEXT: v_sin_f16_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT: s_endpgm
 ;
@@ -146,16 +145,15 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_e32 v2, v3
 ; GFX10-NEXT: v_sin_f16_e32 v1, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT: s_endpgm
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
Index: llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -225,10 +225,9 @@
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt:
 ; GFX9: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX9: v_cvt_f16_f32_e32 v0, v0
 ; GFX9: v_cvt_f16_f32_e32 v1, v3
-; GFX9: v_and_b32_e32 v0, 0xffff, v0
-; GFX9: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9: v_cvt_f16_f32_e32 v0, v0
+; GFX9: v_pack_b32_f16 v0, v0, v1
 ; GFX9: s_setpc_b64
 define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -247,11 +246,10 @@
 ; GFX9-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v2
 ; GFX9-NEXT: s_setpc_b64
 define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
Index: llvm/test/CodeGen/AMDGPU/v_pack.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+define amdgpu_kernel void @v_pack_b32_v2half(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+; GCN-LABEL: v_pack_b32_v2half:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_pack_b32_v2half:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GISEL-NEXT: s_movk_i32 s0, 0x4000
+; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GISEL-NEXT: v_add_f16_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; use v0
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
+  %v0 = load volatile half, half addrspace(1)* %in0.gep
+  %v1 = load volatile half, half addrspace(1)* %in1.gep
+  %v0.add = fadd half %v0, 2.0
+  %v1.add = fadd half %v1, 2.0
+  %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
+  %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
+  %vec.i32 = bitcast <2 x half> %vec.1 to i32
+  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+  ret void
+}
+
+define amdgpu_kernel void @v_pack_b32_v2half_sub(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+; GCN-LABEL: v_pack_b32_v2half_sub:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1
+; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_pack_b32_v2half_sub:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x4000
+; GISEL-NEXT: v_add_f16_e32 v1, -2.0, v1
+; GISEL-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; use v0
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
+  %v0 = load volatile half, half addrspace(1)* %in0.gep
+  %v1 = load volatile half, half addrspace(1)* %in1.gep
+  %v0.add = fsub half %v0, 2.0
+  %v1.add = fadd half %v1, 2.0
+  %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
+  %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
+  %vec.i32 = bitcast <2 x half> %vec.1 to i32
+  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+  ret void
+}
+
+define amdgpu_kernel void @fptrunc(
+; GCN-LABEL: fptrunc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0x31016000
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: fptrunc:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GISEL-NEXT: v_cvt_f16_f32_sdwa v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-NEXT: s_endpgm
+    <2 x half> addrspace(1)* %r,
+    <2 x float> addrspace(1)* %a) {
+  %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
+  %r.val = fptrunc <2 x float> %a.val to <2 x half>
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+; GCN-LABEL: v_pack_b32.fabs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GCN-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_pack_b32.fabs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GISEL-NEXT: s_movk_i32 s0, 0x7fff
+; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GISEL-NEXT: v_and_b32_e32 v0, s0, v0
+; GISEL-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; use v0
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
+  %v0 = load volatile half, half addrspace(1)* %in0.gep
+  %v1 = load volatile half, half addrspace(1)* %in1.gep
+  %v0.add = fadd half %v0, 2.0
+  %v1.add = fadd half %v1, 2.0
+  %v0.fabs = call half @llvm.fabs.f16(half %v0.add)
+  %v1.fabs = call half @llvm.fabs.f16(half %v1.add)
+  %vec.0 = insertelement <2 x half> undef, half %v0.fabs, i32 0
+  %vec.1 = insertelement <2 x half> %vec.0, half %v1.fabs, i32 1
+  %vec.i32 = bitcast <2 x half> %vec.1 to i32
+  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+  ret void
+}
+
+define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+; GCN-LABEL: v_pack_b32.fneg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GCN-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_pack_b32.fneg:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GISEL-NEXT: s_mov_b32 s0, 0x8000
+; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GISEL-NEXT: v_add_f16_e64 v0, s0, -v0
+; GISEL-NEXT: v_add_f16_sdwa v1, s0, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; use v0
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
+  %v0 = load volatile half, half addrspace(1)* %in0.gep
+  %v1 = load volatile half, half addrspace(1)* %in1.gep
+  %v0.add = fadd half %v0, 2.0
+  %v1.add = fadd half %v1, 2.0
+  %v0.fneg = fsub half -0.0, %v0.add
+  %v1.fneg = fsub half -0.0, %v1.add
+  %vec.0 = insertelement <2 x half> undef, half %v0.fneg, i32 0
+  %vec.1 = insertelement <2 x half> %vec.0, half %v1.fneg, i32 1
+  %vec.i32 = bitcast <2 x half> %vec.1 to i32
+  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }