Skip to content

Commit d3b2b97

Browse files
committed Sep 25, 2019
[AMDGPU] gfx10 v_fmac_f16 operand folding
Fold immediates into v_fmac_f16.

Differential Revision: https://reviews.llvm.org/D68037

llvm-svn: 372906
1 parent ac3243c commit d3b2b97

File tree

2 files changed

+20
-13
lines changed

2 files changed

+20
-13
lines changed
 

‎llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -142,16 +142,20 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
142142
switch (Opc) {
143143
case AMDGPU::V_MAC_F32_e64:
144144
case AMDGPU::V_MAC_F16_e64:
145-
case AMDGPU::V_FMAC_F32_e64: {
145+
case AMDGPU::V_FMAC_F32_e64:
146+
case AMDGPU::V_FMAC_F16_e64: {
146147
// Special case for mac. Since this is replaced with mad when folded into
147148
// src2, we need to check the legality for the final instruction.
148149
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
149150
if (static_cast<int>(OpNo) == Src2Idx) {
150-
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
151-
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
151+
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
152+
Opc == AMDGPU::V_FMAC_F16_e64;
153+
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
154+
Opc == AMDGPU::V_FMAC_F32_e64;
152155

153156
unsigned Opc = IsFMA ?
154-
AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
157+
(IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
158+
(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
155159
const MCInstrDesc &MadDesc = TII->get(Opc);
156160
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
157161
}
@@ -314,12 +318,15 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
314318
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
315319
unsigned Opc = MI->getOpcode();
316320
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
317-
Opc == AMDGPU::V_FMAC_F32_e64) &&
321+
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
318322
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
319-
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
320-
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
323+
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
324+
Opc == AMDGPU::V_FMAC_F16_e64;
325+
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
326+
Opc == AMDGPU::V_FMAC_F32_e64;
321327
unsigned NewOpc = IsFMA ?
322-
AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
328+
(IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
329+
(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
323330

324331
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
325332
// to fold the operand.

‎llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,7 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, f
126126
; GFX8_10: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
127127
; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
128128
; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
129-
; GFX10-DENORM: v_fmac_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
129+
; GFX10-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
130130
; GFX10-FLUSH: v_sub_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
131131
define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
132132
%x = bitcast i16 %x.arg to half
@@ -152,7 +152,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i
152152
; VI-FLUSH-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
153153
; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, v{{[0-9]+}}
154154
; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
155-
; GFX10-DENORM-DAG: v_fmac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
155+
; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, s{{[0-9]+}}
156156

157157
; GCN-DAG: buffer_store_short [[MUL2]]
158158
; GCN-DAG: buffer_store_short [[MAD]]
@@ -174,7 +174,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i
174174
; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
175175
; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
176176
; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
177-
; GFX10-DENORM-DAG: v_fmac_f16_e64 [[MAD:v[0-9]+]], |[[X]]|, 2.0
177+
; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}
178178

179179
; GCN-DAG: buffer_store_short [[MUL2]]
180180
; GCN-DAG: buffer_store_short [[MAD]]
@@ -201,8 +201,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i
201201
; GFX10-FLUSH: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |{{s[0-9]+}}|
202202
; GFX10-FLUSH: v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
203203
; GFX10-FLUSH: v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
204-
; GFX10-DENORM: v_fmac_f16_e64 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0
205-
; GFX10-DENORM: v_fmac_f16_e64 {{v[0-9]+}}, |[[X]]|, 2.0
204+
; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, s{{[0-9]+}}
205+
; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, s{{[0-9]+}}
206206

207207
define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
208208
%x = bitcast i16 %x.arg to half

0 commit comments

Comments
 (0)
Please sign in to comment.