Index: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -122,14 +122,17 @@
     VectorType *VT = dyn_cast<VectorType>(ArgTy);
     bool IsV3 = VT && VT->getNumElements() == 3;
 
+    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
+
     VectorType *V4Ty = nullptr;
 
     int64_t AlignDownOffset = alignDown(EltOffset, 4);
     int64_t OffsetDiff = EltOffset - AlignDownOffset;
-    unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+    unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
+                                      KernArgBaseAlign);
 
     Value *ArgPtr;
-    if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
+    if (DoShiftOpt) { // FIXME: Handle aggregate types
       // Since we don't have sub-dword scalar loads, avoid doing an extload by
       // loading earlier than the argument address, and extracting the relevant
       // bits.
@@ -147,7 +150,7 @@
     } else {
       ArgPtr = Builder.CreateConstInBoundsGEP1_64(
         KernArgSegment,
-        AlignDownOffset,
+        EltOffset,
         Arg.getName() + ".kernarg.offset");
       ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
                                      ArgPtr->getName() + ".cast");
@@ -198,7 +201,7 @@
 
     // TODO: Convert noalias arg to !noalias
 
-    if (Size < 32 && !ArgTy->isAggregateType()) {
+    if (DoShiftOpt) {
       Value *ExtractBits = OffsetDiff == 0 ?
         Load : Builder.CreateLShr(Load, OffsetDiff * 8);
 
Index: test/CodeGen/AMDGPU/kernel-args.ll
===================================================================
--- test/CodeGen/AMDGPU/kernel-args.ll
+++ test/CodeGen/AMDGPU/kernel-args.ll
@@ -739,10 +739,10 @@
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-GFX9: kernarg_segment_byte_size = 28
+; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
 ; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 ; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
-; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
@@ -789,10 +789,18 @@
 ; FIXME: Why not all scalar loads?
 ; GCN-LABEL: {{^}}array_3xi16:
 ; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:2
-; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:6
 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
   store volatile i8 %arg0, i8 addrspace(1)* undef
   store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
   ret void
 }
+
+; GCN-LABEL: {{^}}small_array_round_down_offset:
+; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:1
+define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
+  %val = extractvalue [1 x i8] %arg, 0
+  store volatile i8 %val, i8 addrspace(1)* undef
+  ret void
+}
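
Note (illustration only, not part of the patch): a rough LLVM IR sketch of what the pass now produces for the new @small_array_round_down_offset test. Value names, the addrspace(4) constant address space, and the exact shape of the @llvm.amdgcn.kernarg.segment.ptr call are assumptions for illustration; the point is only the offset behavior, where the GEP for the aggregate argument uses EltOffset (1) instead of alignDown(1, 4) = 0, since aggregates do not take the sub-dword shift/extract path.

; Sketch of the lowered IR: the [1 x i8] %arg sits at kernarg byte offset 1,
; so the GEP must use the real offset; rounding down to 0 would load the wrong
; byte. This is what surfaces in the test as "global_load_ubyte ... offset:1".
declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()

define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
  %kernarg.segment = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  ; GEP at EltOffset = 1 (not AlignDownOffset = 0), alignment MinAlign(1, base) = 1
  %arg.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %kernarg.segment, i64 1
  %arg.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg.kernarg.offset to [1 x i8] addrspace(4)*
  %arg.load = load [1 x i8], [1 x i8] addrspace(4)* %arg.kernarg.offset.cast, align 1
  %val = extractvalue [1 x i8] %arg.load, 0
  store volatile i8 %val, i8 addrspace(1)* undef
  ret void
}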