diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1343,7 +1343,8 @@ def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, - AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>; + AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC), + (not FeatureGFX90AInsts))>; def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -591,7 +591,9 @@ } bool d16PreservesUnusedBits() const { - return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); + // gfx90a's d16 loads don't preserve unused bits, even with SRAM ECC off. + return hasD16LoadStore() && !TargetID.isSramEccOnOrAny() && + !hasGFX90AInsts(); } bool hasD16Images() const { diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-sroa=0 -mattr=-promote-alloca,-sramecc -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN,GFX900,GFX900-FLATSCR %s diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sramecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906PLUS,GFX9,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-sroa=0 -mattr=-promote-alloca,-sramecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906PLUS,GFX9,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s @@ -10,7 +11,7 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX906PLUS-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -30,7 +31,7 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX906PLUS-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -50,7 +51,7 @@ ; 
NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX906PLUS-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -69,7 +70,7 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_byte v[0:1], v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off +; GFX906PLUS-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -89,7 +90,7 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_byte v[0:1], v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off +; GFX906PLUS-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -110,8 +111,8 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_short v[0:1], v2{{$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX906PLUS-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX906PLUS-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -134,8 +135,8 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_short v[0:1], v{{[0-9]$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX906PLUS-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX906PLUS-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -157,8 +158,8 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off +; GFX906PLUS-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX906PLUS-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -181,8 +182,8 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: 
global_store_byte v[0:1], v2, off +; GFX906PLUS-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX906PLUS-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -291,8 +292,8 @@ ; GCN: s_waitcnt ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094 +; GFX906PLUS-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX906PLUS-NEXT: flat_store_short v[0:1], v2 offset:4094 ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 @@ -318,8 +319,8 @@ ; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff802, v ; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v -; GFX906-DAG: v_lshrrev_b32_e32 -; GFX906: flat_store_short v[0:1], v2{{$}} +; GFX906PLUS-DAG: v_lshrrev_b32_e32 +; GFX906PLUS: flat_store_short v[0:1], v2{{$}} ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; GFX803: flat_store_short v[0:1], v2{{$}} @@ -343,8 +344,8 @@ ; GFX803-DAG: v_addc_u32_e32 ; GFX803: flat_store_byte v[0:1], v2{{$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}} +; GFX906PLUS-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX906PLUS-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -369,8 +370,8 @@ ; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} -; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906: flat_store_byte v[0:1], v2{{$}} +; GFX906PLUS-DAG: v_lshrrev_b32_e32 v2, 16, v2 +; GFX906PLUS: flat_store_byte v[0:1], v2{{$}} ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v2{{$}}