diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -713,39 +713,6 @@ defm : DSReadPat_mc ; defm : DSReadPat_mc ; -// Prefer ds_read over ds_read2, all other things being equal, because it has -// a larger immediate offset range. -let AddedComplexity = 100 in { - -foreach vt = VReg_64.RegTypes in { -defm : DSReadPat_mc ; -} - -let SubtargetPredicate = isGFX7Plus in { - -foreach vt = VReg_96.RegTypes in { -defm : DSReadPat_mc ; -} - -// For performance reasons restrict this to alignment >= 16 even with -// unaligned-access-mode. At lower alignments ds_read2_b64 is always a better -// choice. -foreach vt = VReg_128.RegTypes in { -defm : DSReadPat_mc ; -} - -let SubtargetPredicate = HasUnalignedAccessMode in { - -foreach vt = VReg_96.RegTypes in { -defm : DSReadPat_mc ; -} - -} // End SubtargetPredicate = HasUnalignedAccessMode - -} // End SubtargetPredicate = isGFX7Plus - -} // End AddedComplexity = 100 - let OtherPredicates = [D16PreservesUnusedBits] in { def : DSReadPat_D16; def : DSReadPat_D16; @@ -869,33 +836,51 @@ defm : DS128Bit8ByteAlignedPat_mc; } -// Prefer ds_write over ds_write2, all other things being equal, because it has -// a larger immediate offset range. +// Prefer ds_read over ds_read2 and ds_write over ds_write2, all other things +// being equal, because it has a larger immediate offset range. let AddedComplexity = 100 in { +foreach vt = VReg_64.RegTypes in { +defm : DSReadPat_mc ; +} + foreach vt = VReg_64.RegTypes in { defm : DSWritePat_mc ; } let SubtargetPredicate = isGFX7Plus in { +foreach vt = VReg_96.RegTypes in { +defm : DSReadPat_mc ; +} + foreach vt = VReg_96.RegTypes in { defm : DSWritePat_mc ; } -// For performance reasons restrict this to alignment >= 16 even with -// unaligned-access-mode. At lower alignments ds_write2_b64 is always a better -// choice. 
+foreach vt = VReg_128.RegTypes in { +defm : DSReadPat_mc ; +} + foreach vt = VReg_128.RegTypes in { defm : DSWritePat_mc ; } let SubtargetPredicate = HasUnalignedAccessMode in { +// FIXME: Is ds_read_b96/ds_write_b96 a better choice in unaligned-access-mode? +foreach vt = VReg_96.RegTypes in { +defm : DSReadPat_mc ; +} + foreach vt = VReg_96.RegTypes in { defm : DSWritePat_mc ; } +// For performance reasons, *do not* select ds_read_b128/ds_write_b128 in +// unaligned-access-mode. At lower alignments ds_read2_b64/ds_write2_b64 is +// always a better choice. + } // End SubtargetPredicate = HasUnalignedAccessMode } // End SubtargetPredicate = isGFX7Plus diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1412,10 +1412,15 @@ return true; } + // Either the alignment requirements are "enabled", or there is an + // unaligned LDS access hardware bug even though alignment requirements + // are "disabled". In either case, we need to check for proper alignment + // requirements. + // if (Size == 64) { - // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte - // aligned, 8 byte access in a single operation using ds_read2/write2_b32 - // with adjacent offsets. + // An 8-byte access via ds_read/write_b64 requires 8-byte alignment, but we + // can do a 4-byte aligned, 8-byte access in a single operation using + // ds_read2/write2_b32 with adjacent offsets. bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; @@ -1423,22 +1428,22 @@ return AlignedBy4; } if (Size == 96) { - // ds_read/write_b96 require 16-byte alignment on gfx8 and older. - bool Aligned = Alignment >= Align(16); + // A 12-byte access via ds_read/write_b96 requires 16-byte alignment. 
+ bool AlignedBy16 = Alignment >= Align(16); if (IsFast) - *IsFast = Aligned; + *IsFast = AlignedBy16; - return Aligned; + return AlignedBy16; } if (Size == 128) { - // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we - // can do a 8 byte aligned, 16 byte access in a single operation using + // A 16-byte access via ds_read/write_b128 requires 16-byte alignment, but + // we can do an 8-byte aligned, 16-byte access in a single operation using // ds_read2/write2_b64. - bool Aligned = Alignment >= Align(8); + bool AlignedBy8 = Alignment >= Align(8); if (IsFast) - *IsFast = Aligned; + *IsFast = AlignedBy8; - return Aligned; + return AlignedBy8; } }