Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -977,6 +977,15 @@ case AMDGPU::V_ASHRREV_I32_e32: Result = static_cast<int32_t>(RHS) >> (LHS & 31); return true; + case AMDGPU::S_PACK_LL_B32_B16: + Result = (LHS & 0xffff) | ((RHS & 0xffff) << 16); + return true; + case AMDGPU::S_PACK_LH_B32_B16: + Result = (LHS & 0xffff) | (RHS & 0xffff0000); + return true; + case AMDGPU::S_PACK_HH_B32_B16: + Result = ((LHS & 0xffff0000) >> 16) | (RHS & 0xffff0000); + return true; default: return false; } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll @@ -122,19 +122,15 @@ ; GFX906-LABEL: v_fdot2_inline_literal_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_movk_i32 s4, 0x4000 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX906-NEXT: v_dot2_f32_f16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdot2_inline_literal_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0x4000 +; GFX10-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX10-NEXT: v_dot2_f32_f16 v0, s4, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false) ret float %ret @@ -144,19 +140,15 @@ ; GFX906-LABEL: v_fdot2_inline_literal_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_movk_i32 s4, 0x4000 -; GFX906-NEXT: 
s_pack_ll_b32_b16 s4, s4, s4 -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, s4, v1 +; GFX906-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdot2_inline_literal_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0x4000 +; GFX10-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX10-NEXT: v_dot2_f32_f16 v0, v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false) ret float %ret Index: llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -791,3 +791,90 @@ S_ENDPGM 0, implicit %4 ... + +--- +name: fold_pack_ll_imm_imm +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true + +body: | + bb.0: + ;liveins: $vgpr0 + + ; GCN-LABEL: name: fold_pack_ll_imm_imm + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 196609 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65539 + ; GCN: $sgpr0 = COPY [[S_MOV_B32_]] + ; GCN: $sgpr1 = COPY [[S_MOV_B32_1]] + ; GCN: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + %1:sreg_32 = S_MOV_B32 1 + %2:sreg_32 = S_MOV_B32 3 + %3:sreg_32 = S_PACK_LL_B32_B16 %1, %2 + %4:sreg_32 = S_PACK_LL_B32_B16 %2, %1 + $sgpr0 = COPY %3:sreg_32 + $sgpr1 = COPY %4:sreg_32 + SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + +... 
+ +--- +name: fold_pack_lh_imm_imm +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true + +body: | + bb.0: + ;liveins: $vgpr0 + + ; GCN-LABEL: name: fold_pack_lh_imm_imm + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65537 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: $sgpr0 = COPY [[S_MOV_B32_]] + ; GCN: $sgpr1 = COPY [[S_MOV_B32_1]] + ; GCN: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + %1:sreg_32 = S_MOV_B32 1 + %2:sreg_32 = S_MOV_B32 65536 + %3:sreg_32 = S_PACK_LH_B32_B16 %1, %2 + %4:sreg_32 = S_PACK_LH_B32_B16 %2, %1 + $sgpr0 = COPY %3:sreg_32 + $sgpr1 = COPY %4:sreg_32 + SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + +... + +--- +name: fold_pack_hh_imm_imm +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true + +body: | + bb.0: + ;liveins: $vgpr0 + + ; GCN-LABEL: name: fold_pack_hh_imm_imm + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65538 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 131073 + ; GCN: $sgpr0 = COPY [[S_MOV_B32_]] + ; GCN: $sgpr1 = COPY [[S_MOV_B32_1]] + ; GCN: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + %1:sreg_32 = S_MOV_B32 131072 + %2:sreg_32 = S_MOV_B32 65536 + %3:sreg_32 = S_PACK_HH_B32_B16 %1, %2 + %4:sreg_32 = S_PACK_HH_B32_B16 %2, %1 + $sgpr0 = COPY %3:sreg_32 + $sgpr1 = COPY %4:sreg_32 + SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + +...