Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3464,10 +3464,6 @@
   return false;
 }
 
-static bool isDwordAligned(unsigned Alignment) {
-  return Alignment % 4 == 0;
-}
-
 //===----------------------------------------------------------------------===//
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
@@ -5385,21 +5381,23 @@
          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
 
   unsigned NumElements = MemVT.getVectorNumElements();
+
   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
-    if (!Op->isDivergent())
+    if (!Op->isDivergent() && Alignment >= 4)
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
     // have the same legalization requirements as global and private
     // loads.
-    //
+    // Uniform loads with alignment below 4 are legalized the same way.
   }
+
   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
       AS == AMDGPUASI.GLOBAL_ADDRESS) {
     if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
         !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
-        isDwordAligned(Alignment))
+        Alignment >= 4)
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
     // have the same legalization requirements as global and private
Index: test/CodeGen/AMDGPU/load-constant-i16.ll
===================================================================
--- test/CodeGen/AMDGPU/load-constant-i16.ll
+++ test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -72,6 +72,20 @@
   ret void
 }
 
+; align 2 is below the alignment of 4 required for the uniform-load early
+; return in SIISelLowering.cpp, so two dwordx4 loads are expected on HSA.
+; FUNC-LABEL: {{^}}constant_load_v16i16_align2:
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define amdgpu_kernel void @constant_load_v16i16_align2(<16 x i16> addrspace(4)* %ptr0) #0 {
+entry:
+  %ld = load <16 x i16>, <16 x i16> addrspace(4)* %ptr0, align 2
+  store <16 x i16> %ld, <16 x i16> addrspace(1)* undef, align 32
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:
 ; GCN-NOHSA: buffer_load_ushort
 ; GCN-NOHSA: buffer_store_dword
Index: test/CodeGen/AMDGPU/load-global-i16.ll
===================================================================
--- test/CodeGen/AMDGPU/load-global-i16.ll
+++ test/CodeGen/AMDGPU/load-global-i16.ll
@@ -83,6 +83,20 @@
   ret void
 }
 
+; align 2 is below the alignment of 4 required for the uniform-load early
+; return in SIISelLowering.cpp, so two dwordx4 loads are expected on HSA.
+; FUNC-LABEL: {{^}}global_load_v16i16_align2:
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define amdgpu_kernel void @global_load_v16i16_align2(<16 x i16> addrspace(1)* %in, <16 x i16> addrspace(1)* %out) #0 {
+entry:
+  %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in, align 2
+  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out, align 32
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}global_zextload_i16_to_i32:
 ; GCN-NOHSA: buffer_load_ushort
 ; GCN-NOHSA: buffer_store_dword