Index: llvm/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.td +++ llvm/lib/Target/AMDGPU/AMDGPU.td @@ -105,6 +105,20 @@ "Support unaligned scratch loads and stores" >; +// LDS alignment enforcement is controlled by a configuration register: +// SH_MEM_CONFIG.alignment_mode +def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode", + "UnalignedAccessMode", + "true", + "Support unaligned local and region loads and stores" +>; + +def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access", + "UnalignedDSAccess", + "true", + "Does not requires 16 byte alignment for certain local and region loads and stores" +>; + def FeatureApertureRegs : SubtargetFeature<"aperture-regs", "HasApertureRegs", "true", @@ -695,7 +709,7 @@ FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, - FeatureFastDenormalF32 + FeatureFastDenormalF32, FeatureUnalignedDSAccess ] >; @@ -713,7 +727,8 @@ FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC, - FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16 + FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16, + FeatureUnalignedDSAccess ] >; Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -315,6 +315,7 @@ bool CodeObjectV3; bool UnalignedScratchAccess; bool UnalignedBufferAccess; + bool UnalignedAccessMode; bool HasApertureRegs; bool EnableXNACK; bool DoesNotSupportXNACK; @@ -394,6 +395,7 @@ bool HasMFMAInlineLiteralBug; bool HasVertexCache; short TexVTXClauseSize; + bool UnalignedDSAccess; bool ScalarizeGlobal; bool HasVcmpxPermlaneHazard; @@ -699,6 +701,14 @@ return UnalignedScratchAccess; } + bool hasUnalignedAccessMode() const { + return UnalignedAccessMode; + } + + bool hasUnalignedDSAccess() const { + return UnalignedDSAccess; + } + bool hasApertureRegs() const { return HasApertureRegs; } Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -187,6 +187,7 @@ CodeObjectV3(false), UnalignedScratchAccess(false), UnalignedBufferAccess(false), + UnalignedAccessMode(false), HasApertureRegs(false), EnableXNACK(false), @@ -257,6 +258,7 @@ HasUnpackedD16VMem(false), LDSMisalignedBug(false), HasMFMAInlineLiteralBug(false), + UnalignedDSAccess(false), ScalarizeGlobal(false), Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -89,6 +89,7 @@ AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedBufferAccess, AMDGPU::FeatureUnalignedScratchAccess, + AMDGPU::FeatureUnalignedAccessMode, AMDGPU::FeatureAutoWaitcntBeforeBarrier, Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1363,14 +1363,44 @@ if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) { - // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte - // aligned, 8 byte access in a single operation using ds_read2/write2_b32 - // with adjacent offsets. - bool AlignedBy4 = (Align % 4 == 0); - if (IsFast) - *IsFast = AlignedBy4; + // Check if alignment requirements for ds_read/write instructions are + // disabled. + if (Subtarget->hasUnalignedDSAccess() && + Subtarget->hasUnalignedAccessMode()) { + if (IsFast) + *IsFast = true; + return true; + } - return AlignedBy4; + if (Size == 64) { + // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte + // aligned, 8 byte access in a single operation using ds_read2/write2_b32 + // with adjacent offsets. + bool AlignedBy4 = (Align % 4 == 0); + if (IsFast) + *IsFast = AlignedBy4; + + return AlignedBy4; + } + if (Size == 96) { + // ds_read/write_b96 require 16-byte alignment on gfx8 and older. + bool Aligned = + Align % (Subtarget->hasUnalignedDSAccess() ? 4 : 16) == 0; + if (IsFast) + *IsFast = Aligned; + + return Aligned; + } + if (Size == 128) { + // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we + // can do a 8 byte aligned, 16 byte access in a single operation using + // ds_read2/write2_b64. + bool Aligned = Align % (Subtarget->hasUnalignedDSAccess() ? 4 : 8) == 0; + if (IsFast) + *IsFast = Aligned; + + return Aligned; + } } // FIXME: We have to be conservative here and assume that flat operations @@ -1386,7 +1416,9 @@ return AlignedBy4; } - if (Subtarget->hasUnalignedBufferAccess()) { + if (Subtarget->hasUnalignedBufferAccess() && + !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS)) { // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir @@ -62,8 +62,7 @@ ; GFX7-DS128-LABEL: name: load_local_v4s32_align_4 ; GFX7-DS128: liveins: $vgpr0 ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX7-DS128: $m0 = S_MOV_B32 -1 - ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_128(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) + ; GFX7-DS128: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) ; GFX9-LABEL: name: load_local_v4s32_align_4 ; GFX9: liveins: $vgpr0 @@ -100,8 +99,7 @@ ; GFX7-DS128-LABEL: name: load_local_v2s64 ; GFX7-DS128: liveins: $vgpr0 ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX7-DS128: $m0 = S_MOV_B32 -1 - ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_128(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) + ; GFX7-DS128: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; GFX9-LABEL: name: load_local_v2s64 ; GFX9: liveins: $vgpr0 @@ -138,8 +136,7 @@ ; GFX7-DS128-LABEL: name: load_local_v2p1 ; GFX7-DS128: liveins: $vgpr0 ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX7-DS128: $m0 = S_MOV_B32 -1 - ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) + ; GFX7-DS128: [[LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) ; GFX9-LABEL: name: load_local_v2p1 ; GFX9: liveins: $vgpr0 @@ -176,8 +173,7 @@ ; GFX7-DS128-LABEL: name: load_local_s128 ; GFX7-DS128: liveins: $vgpr0 ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX7-DS128: $m0 = S_MOV_B32 -1 - ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) + ; GFX7-DS128: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) ; GFX9-LABEL: name: load_local_s128 ; GFX9: liveins: $vgpr0 @@ -214,8 +210,7 @@ ; GFX7-DS128-LABEL: name: load_local_v8s16 ; GFX7-DS128: liveins: $vgpr0 ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX7-DS128: $m0 = S_MOV_B32 -1 - ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) + ; GFX7-DS128: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) ; GFX9-LABEL: name: load_local_v8s16 ; GFX9: liveins: $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir @@ -1669,13 +1669,25 @@ ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; CI-DS128-LABEL: name: test_load_local_s96_align8 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 8, addrspace 3) - ; CI-DS128: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; CI-DS128: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, align 8, addrspace 3) + ; CI-DS128: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; CI-DS128: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; CI-DS128: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; CI-DS128: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; VI-LABEL: name: test_load_local_s96_align8 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 8, addrspace 3) - ; VI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, align 8, addrspace 3) + ; VI: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; VI: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; VI: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; VI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; GFX9-LABEL: name: test_load_local_s96_align8 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -1717,13 +1729,25 @@ ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; CI-DS128-LABEL: name: test_load_local_s96_align4 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) - ; CI-DS128: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; CI-DS128: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3) + ; CI-DS128: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; CI-DS128: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; CI-DS128: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; CI-DS128: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; VI-LABEL: name: test_load_local_s96_align4 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) - ; VI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3) + ; VI: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; VI: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; VI: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; VI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; GFX9-LABEL: name: test_load_local_s96_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -2960,13 +2984,33 @@ ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; CI-DS128-LABEL: name: test_load_local_s128_align4 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; CI-DS128: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 4, addrspace 3) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 8, addrspace 3) + ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI-DS128: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 12, addrspace 3) + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; CI-DS128: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; VI-LABEL: name: test_load_local_s128_align4 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; VI: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 4, addrspace 3) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 8, addrspace 3) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 12, addrspace 3) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; VI: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; GFX9-LABEL: name: test_load_local_s128_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -7873,12 +7917,24 @@ ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>) ; CI-DS128-LABEL: name: test_load_local_v3s32_align4 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) - ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; CI-DS128: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3) + ; CI-DS128: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; CI-DS128: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; CI-DS128: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>) ; VI-LABEL: name: test_load_local_v3s32_align4 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) - ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3) + ; VI: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; VI: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; VI: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>) ; GFX9-LABEL: name: test_load_local_v3s32_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) @@ -7990,12 +8046,32 @@ ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) ; CI-DS128-LABEL: name: test_load_local_v4s32_align4 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 4, addrspace 3) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 8, addrspace 3) + ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI-DS128: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 12, addrspace 3) + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; VI-LABEL: name: test_load_local_v4s32_align4 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 4, addrspace 3) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 8, addrspace 3) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 12, addrspace 3) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; GFX9-LABEL: name: test_load_local_v4s32_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) @@ -8897,12 +8973,20 @@ ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; CI-DS128-LABEL: name: test_load_local_v2s64_align4 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; CI-DS128: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64) + ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_load_local_v2s64_align4 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: test_load_local_v2s64_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) @@ -9693,12 +9777,20 @@ ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; CI-DS128-LABEL: name: test_load_local_v2p1_align4 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; CI-DS128: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[LOAD]](p1), [[LOAD1]](p1) + ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; VI-LABEL: name: test_load_local_v2p1_align4 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; VI: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[LOAD]](p1), [[LOAD1]](p1) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; GFX9-LABEL: name: test_load_local_v2p1_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) @@ -11410,28 +11502,50 @@ ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96) ; CI-DS128-LABEL: name: test_extload_local_v2s96_from_24_align4 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) - ; CI-DS128: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI-DS128: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; CI-DS128: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3) - ; CI-DS128: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) - ; CI-DS128: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) - ; CI-DS128: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) - ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; CI-DS128: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3) + ; CI-DS128: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; CI-DS128: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY [[DEF]](<3 x s32>) + ; CI-DS128: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[COPY1]], [[LOAD]](<2 x s32>), 0 + ; CI-DS128: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; CI-DS128: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI-DS128: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; CI-DS128: [[LOAD2:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 + 12, align 4, addrspace 3) + ; CI-DS128: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 20, addrspace 3) + ; CI-DS128: [[INSERT2:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD2]](<2 x s32>), 0 + ; CI-DS128: [[INSERT3:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64 + ; CI-DS128: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT3]](<3 x s32>) + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; CI-DS128: [[COPY3:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96) + ; CI-DS128: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96) ; VI-LABEL: name: test_extload_local_v2s96_from_24_align4 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) - ; VI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3) - ; VI: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) - ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) - ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) - ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3) + ; VI: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; VI: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY [[DEF]](<3 x s32>) + ; VI: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[COPY1]], [[LOAD]](<2 x s32>), 0 + ; VI: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; VI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; VI: [[LOAD2:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 + 12, align 4, addrspace 3) + ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 20, addrspace 3) + ; VI: [[INSERT2:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD2]](<2 x s32>), 0 + ; VI: [[INSERT3:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64 + ; VI: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT3]](<3 x s32>) + ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; VI: [[COPY3:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96) + ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96) ; GFX9-LABEL: name: test_extload_local_v2s96_from_24_align4 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) @@ -11510,8 +11624,14 @@ ; CI-DS128: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; CI-DS128: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3) - ; CI-DS128: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; CI-DS128: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 12, align 4, addrspace 3) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD]], [[C1]](s32) + ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 20, addrspace 3) + ; CI-DS128: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; CI-DS128: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD1]](<2 x s32>), 0 + ; CI-DS128: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD2]](s32), 64 + ; CI-DS128: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) ; CI-DS128: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; CI-DS128: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -11522,8 +11642,14 @@ ; VI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3) - ; VI: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 12, align 4, addrspace 3) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD]], [[C1]](s32) + ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 20, addrspace 3) + ; VI: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; VI: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD1]](<2 x s32>), 0 + ; VI: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD2]](s32), 64 + ; VI: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) Index: llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll +++ llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; There is no dependence between the store and the two loads. So we can combine the loads @@ -6,8 +6,8 @@ ; GCN-LABEL: {{^}}ds_combine_nodep -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8 -; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 +; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 +; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8 define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) { %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)* Index: llvm/test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -473,8 +473,8 @@ ; GFX9-NOT: m0 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}} -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3 -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1 +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3 +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 Index: llvm/test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -505,8 +505,8 @@ ; CI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}} -; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} +; GCN-DAG: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-DAG: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in Index: llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll =================================================================== --- llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll +++ llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll @@ -526,7 +526,8 @@ } ; CHECK-LABEL: @merge_local_store_4_constants_i32 -; CHECK: store <4 x i32> , <4 x i32> addrspace(3)* +; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* %1, align 4 +; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* %2, align 4 define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 Index: llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll =================================================================== --- llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll +++ llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -1,4 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -basic-aa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -basic-aa -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -basic-aa -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" @@ -6,10 +7,10 @@ ; for a the same head starting a chain. @0 = internal addrspace(3) global [16384 x i32] undef -; CHECK-LABEL: @no_crash( -; CHECK: store <2 x i32> zeroinitializer -; CHECK: store i32 0 -; CHECK: store i32 0 +; GCN-LABEL: @no_crash( +; GCN: store <2 x i32> zeroinitializer +; GCN: store i32 0 +; GCN: store i32 0 define amdgpu_kernel void @no_crash(i32 %arg) { %tmp2 = add i32 %arg, 14 @@ -28,13 +29,22 @@ ; Check adjiacent memory locations are properly matched and the ; longest chain vectorized -; CHECK-LABEL: @interleave_get_longest -; CHECK: load <4 x i32> -; CHECK: load i32 -; CHECK: store <2 x i32> zeroinitializer -; CHECK: load i32 -; CHECK: load i32 -; CHECK: load i32 +; GCN-LABEL: @interleave_get_longest + +; GFX7: load <2 x i32> +; GFX7: load i32 +; GFX7: store <2 x i32> zeroinitializer +; GFX7: load i32 +; GFX7: load <2 x i32> +; GFX7: load i32 +; GFX7: load i32 + +; GFX9: load <4 x i32> +; GFX9: load i32 +; GFX9: store <2 x i32> zeroinitializer +; GFX9: load i32 +; GFX9: load i32 +; GFX9: load i32 define amdgpu_kernel void @interleave_get_longest(i32 %arg) { %a1 = add i32 %arg, 1