diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -540,6 +540,31 @@
 // TODO: Add GISelPredicateCode for the ret and noret PatFrags once
 // GlobalISelEmitter allows pattern matches where src and dst def count
 // mismatch.
+
+multiclass ret_noret_op {
+  let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
+      GISelPredicateCode = [{ return true; }] in {
+    def "_ret" : PatFrag<(ops node:$ptr, node:$data),
+                         (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
+  }
+
+  let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
+      GISelPredicateCode = [{ return false; }] in {
+    def "_noret" : PatFrag<(ops node:$ptr, node:$data),
+                           (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
+  }
+}
+
+defm int_amdgcn_flat_atomic_fadd : ret_noret_op;
+defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op;
+defm int_amdgcn_flat_atomic_fmin : ret_noret_op;
+defm int_amdgcn_flat_atomic_fmax : ret_noret_op;
+defm int_amdgcn_global_atomic_fadd : ret_noret_op;
+defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op;
+defm int_amdgcn_global_atomic_fmin : ret_noret_op;
+defm int_amdgcn_global_atomic_fmax : ret_noret_op;
+defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op;
+
 multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
   let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
       GISelPredicateCode = [{ return false; }] in {
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1188,9 +1188,9 @@
     let SubtargetPredicate = isGFX90AOnly;
   }
 
-  defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
-  defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
-  defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+  defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
+  defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
+  defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
 } // End SubtargetPredicate = isGFX90APlus
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
@@ -1381,10 +1381,11 @@
 // buffer_atomic patterns
 //===----------------------------------------------------------------------===//
 
-multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst> {
+multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
+                           bit isIntr = 0> {
   foreach RtnMode = ["ret", "noret"] in {
 
-  defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode # "_" # vt.Size);
+  defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode
+                                       # !if(isIntr, "", "_" # vt.Size));
   defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
 
   def : GCNPat<
@@ -1592,6 +1593,9 @@
 }
 
 let SubtargetPredicate = isGFX90APlus in {
+  defm : BufferAtomicPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64", 1>;
+  defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64", 1>;
+  defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64", 1>;
 
 defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">;
 defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1025,9 +1025,13 @@
 let SubtargetPredicate = isGFX940Plus in {
 def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
 def : GCNPat <
-  (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
+  (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)),
   (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
 >;
+def : GCNPat <
+  (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
+  (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
+>;
 }
 
 def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -711,17 +711,17 @@
 } // End SubtargetPredicate = isGFX7GFX10
 
 let SubtargetPredicate = isGFX90APlus in {
-  defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
-  defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
-  defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
-  defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
-  defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
-  defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+  defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
+  defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
+  defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
+  defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
+  defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
+  defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
 } // End SubtargetPredicate = isGFX90APlus
 
 let SubtargetPredicate = isGFX940Plus in {
-  defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32, int_amdgcn_flat_atomic_fadd>;
-  defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_flat_atomic_fadd>;
+  defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
+  defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>;
   defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
   defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
 } // End SubtargetPredicate = isGFX940Plus
@@ -897,15 +897,15 @@
 defm GLOBAL_ATOMIC_FCMPSWAP :
   FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>;
 defm GLOBAL_ATOMIC_FMIN :
-  FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32, int_amdgcn_global_atomic_fmin>;
+  FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
 defm GLOBAL_ATOMIC_FMAX :
-  FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32, int_amdgcn_global_atomic_fmax>;
+  FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
 defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
   FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, null_frag, v2f64, VReg_128>;
 defm GLOBAL_ATOMIC_FMIN_X2 :
-  FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
+  FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
 defm GLOBAL_ATOMIC_FMAX_X2 :
-  FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+  FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
 } // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
 
 let is_flat_global = 1 in {
@@ -920,10 +920,10 @@
 
 let OtherPredicates = [isGFX90APlus] in {
   defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
-    "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
+    "global_atomic_add_f32", VGPR_32, f32
   >;
   defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
-    "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
+    "global_atomic_pk_add_f16", VGPR_32, v2f16
   >;
 } // End OtherPredicates = [isGFX90APlus]
 } // End is_flat_global = 1
@@ -1029,13 +1029,30 @@
     (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
 }
 
+multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
+                                ValueType data_vt = vt, bit isIntr = 0> {
+  defvar rtnNode = !cast<SDPatternOperator>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+  defvar noRtnNode = !cast<SDPatternOperator>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+
+  def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+    (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+  def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+    (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatSignedIntrPat <string inst, string node, ValueType vt,
+                              ValueType data_vt = vt> {
+  defm : FlatSignedAtomicPat<inst, node, vt, data_vt, 1>;
+}
+
 class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data),
   (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
 >;
 
-class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
-                           ValueType data_vt = vt> : GCNPat <
+class FlatSignedAtomicPatRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
+                              ValueType data_vt = vt> : GCNPat <
   (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
   (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
 >;
@@ -1237,7 +1254,7 @@
 multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator node,
                                    ValueType vt, ValueType data_vt = vt> {
-  def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
+  def : FlatSignedAtomicPatRtn <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
     let AddedComplexity = 10;
   }
@@ -1247,13 +1264,12 @@
 multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
-                                ValueType data_vt = vt> {
-  defvar rtnNode = !cast<SDPatternOperator>(node#"_ret_"#vt.Size);
-  defvar noRtnNode = !cast<SDPatternOperator>(node#"_noret_"#vt.Size);
+                                ValueType data_vt = vt, bit isIntr = 0> {
+  defvar rtnNode = !cast<SDPatternOperator>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+  defvar noRtnNode = !cast<SDPatternOperator>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
 
   let AddedComplexity = 10 in {
-    def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
-    def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
+    defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>;
  }
 
  let AddedComplexity = 11 in {
@@ -1262,6 +1278,11 @@
   }
 }
 
+multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
+                                    ValueType data_vt = vt> {
+  defm : GlobalFLATAtomicPats<inst, node, vt, data_vt, 1>;
+}
+
 multiclass GlobalFLATNoRtnAtomicPats <string inst, string node, ValueType vt> {
   def : FlatSignedAtomicPatNoRtn <!cast<FLAT_Pseudo>(inst), node, vt> {
@@ -1427,6 +1448,10 @@
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
"int_amdgcn_global_atomic_fmax", f64>; } let OtherPredicates = [HasAtomicFaddInsts] in { @@ -1440,19 +1465,26 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>; +defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>; +defm : FlatSignedIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>; +defm : FlatSignedIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; } let OtherPredicates = [isGFX940Plus] in { -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_PK_ADD_BF16", int_amdgcn_global_atomic_fadd_v2bf16, v2i16>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>; +defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; +defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; +defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; } } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -319,10 +319,10 @@ ; GFX90A-LABEL: global_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] glc ; GFX90A-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) @@ -333,10 +333,10 @@ ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@@ -347,10 +347,10 @@
 ; GFX90A-LABEL: global_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -74,10 +74,10 @@
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, v0, s[0:1] offset:2048 glc
 ; GFX90A-NEXT:    s_endpgm
   %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
@@ -23,13 +23,12 @@
 ; G_GFX10-LABEL: global_atomic_fmin_f32_noret:
 ; G_GFX10:       ; %bb.0: ; %main_body
 ; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; G_GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; G_GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX10-NEXT:    global_atomic_fmin v0, v1, v0, s[2:3] glc
 ; G_GFX10-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
@@ -51,13 +50,12 @@
 ; G_GFX10-LABEL: global_atomic_fmax_f32_noret:
 ; G_GFX10:       ; %bb.0: ; %main_body
 ; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; G_GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; G_GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX10-NEXT:    global_atomic_fmax v0, v1, v0, s[2:3] glc
 ; G_GFX10-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
@@ -120,12 +118,11 @@
 ; G_GFX10-LABEL: global_atomic_fmin_f64_noret:
 ; G_GFX10:       ; %bb.0: ; %main_body
 ; G_GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; G_GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; G_GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; G_GFX10-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@@ -146,12 +143,11 @@
 ; G_GFX10-LABEL: global_atomic_fmax_f64_noret:
 ; G_GFX10:       ; %bb.0: ; %main_body
 ; G_GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; G_GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; G_GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; G_GFX10-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)