diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1737,7 +1737,7 @@ defm : MUBUFScratchStorePat ; -let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in { // Hiding the extract high pattern in the PatFrag seems to not // automatically increase the complexity. let AddedComplexity = 1 in { diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -778,7 +778,7 @@ defm : DSAtomicWritePat_mc ; defm : DSAtomicWritePat_mc ; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { def : DSWritePat ; def : DSWritePat ; } diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1027,10 +1027,12 @@ def : FlatStorePat ; def : FlatStorePat ; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { def : FlatStorePat ; def : FlatStorePat ; +} +let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatLoadPat_D16 ; def : FlatLoadPat_D16 ; def : FlatLoadPat_D16 ; @@ -1186,10 +1188,12 @@ defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; +} +let OtherPredicates = [D16PreservesUnusedBits] in { defm : GlobalFLATLoadPats_D16 ; defm : GlobalFLATLoadPats_D16 ; defm : GlobalFLATLoadPats_D16 ; @@ -1291,10 +1295,12 @@ defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; -let OtherPredicates = [D16PreservesUnusedBits, 
HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in { defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; +} +let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in { defm : ScratchFLATLoadPats_D16 ; defm : ScratchFLATLoadPats_D16 ; defm : ScratchFLATLoadPats_D16 ; diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s @@ -10,7 +10,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -30,7 +29,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -50,7 
+48,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -69,7 +66,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_byte v[0:1], v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -89,7 +85,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_byte v[0:1], v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -110,9 +105,6 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_short v[0:1], v2{{$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 { @@ -134,9 +126,6 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_short v[0:1], v{{[0-9]$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 { @@ -157,9 +146,6 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 { @@ -181,9 +167,6 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 { @@ -291,9 +274,6 @@ ; GCN: s_waitcnt ; 
GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094 - ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 @@ -318,9 +298,6 @@ ; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff802, v ; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v -; GFX906-DAG: v_lshrrev_b32_e32 -; GFX906: flat_store_short v[0:1], v2{{$}} - ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; GFX803: flat_store_short v[0:1], v2{{$}} ; GCN-NEXT: s_waitcnt @@ -343,9 +320,6 @@ ; GFX803-DAG: v_addc_u32_e32 ; GFX803: flat_store_byte v[0:1], v2{{$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}} - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 { @@ -369,9 +343,6 @@ ; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} -; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906: flat_store_byte v[0:1], v2{{$}} - ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v2{{$}}