Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1203,9 +1203,13 @@ const SISubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo &MFI = *MF.getInfo(); HSAMD::Kernel::CodeProps::Metadata HSACodeProps; + const Function &F = MF.getFunction(); - HSACodeProps.mKernargSegmentSize = - STM.getKernArgSegmentSize(MF.getFunction(), MFI.getExplicitKernArgSize()); + // Avoid asserting on erroneous cases. + if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) + return HSACodeProps; + + HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F); HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; HSACodeProps.mKernargSegmentAlign = Index: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -74,31 +74,11 @@ BasicBlock &EntryBlock = *F.begin(); IRBuilder<> Builder(&*EntryBlock.begin()); - SmallVector ArgTypes; - for (Argument &Arg : F.args()) { - Type *ArgTy = Arg.getType(); - unsigned Size = DL.getTypeStoreSizeInBits(ArgTy); - bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) && - !ST.isAmdHsaOS(); - - // Clover seems to always pad i8/i16 to i32, but doesn't properly align - // them? - // Make sure the struct elements have correct size and alignment for ext - // args. These seem to be padded up to 4-bytes but not correctly aligned. - ArgTypes.push_back( - IsExtArg ? ArrayType::get(ArgTy, 32 / Size) : Arg.getType()); - } - - StructType *ArgStructTy = StructType::create(Ctx, ArgTypes, F.getName()); - const StructLayout *Layout = DL.getStructLayout(ArgStructTy); - - // Minimum alignment for kern segment is 16. 
- unsigned KernArgBaseAlign = std::max(16u, DL.getABITypeAlignment(ArgStructTy)); + const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); // FIXME: Alignment is broken broken with explicit arg offset.; - const uint64_t TotalKernArgSize = BaseOffset + - ST.getKernArgSegmentSize(F, DL.getTypeAllocSize(ArgStructTy)); + const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F); if (TotalKernArgSize == 0) return false; @@ -109,23 +89,34 @@ KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); - KernArgSegment->addAttribute(AttributeList::ReturnIndex, - Attribute::getWithAlignment(Ctx, KernArgBaseAlign)); - - Value *KernArgBase = KernArgSegment; - if (BaseOffset != 0) { - KernArgBase = Builder.CreateConstInBoundsGEP1_64(KernArgBase, BaseOffset); - KernArgBaseAlign = MinAlign(KernArgBaseAlign, BaseOffset); - } unsigned AS = KernArgSegment->getType()->getPointerAddressSpace(); - Value *CastStruct = Builder.CreateBitCast(KernArgBase, - ArgStructTy->getPointerTo(AS)); + unsigned MaxAlign = 1; + uint64_t ExplicitArgOffset = 0; + for (Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + unsigned Align = DL.getABITypeAlignment(ArgTy); + MaxAlign = std::max(Align, MaxAlign); + unsigned Size = DL.getTypeSizeInBits(ArgTy); + unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + + + // Clover seems to always pad i8/i16 to i32, but doesn't properly align + // them? + // Make sure the struct elements have correct size and alignment for ext + // args. These seem to be padded up to 4-bytes but not correctly aligned. + bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) && + !ST.isAmdHsaOS(); + if (IsExtArg) + AllocSize = 4; + + uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + if (Arg.use_empty()) continue; - Type *ArgTy = Arg.getType(); if (PointerType *PT = dyn_cast(ArgTy)) { // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing // modes on SI to know the high bits are 0 so pointer adds don't wrap. We @@ -145,10 +136,6 @@ bool IsV3 = VT && VT->getNumElements() == 3; VectorType *V4Ty = nullptr; - unsigned Size = DL.getTypeSizeInBits(ArgTy); - bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) && - !ST.isAmdHsaOS(); - int64_t EltOffset = Layout->getElementOffset(Arg.getArgNo()); int64_t AlignDownOffset = alignDown(EltOffset, 4); int64_t OffsetDiff = EltOffset - AlignDownOffset; unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset); @@ -162,19 +149,24 @@ // Additionally widen any sub-dword load to i32 even if suitably aligned, // so that CSE between different argument loads works easily. 
- ArgPtr = Builder.CreateConstGEP1_64(KernArgBase, AlignDownOffset); - ArgPtr = Builder.CreateBitCast( - ArgPtr, - Builder.getInt32Ty()->getPointerTo(AS), + ArgPtr = Builder.CreateConstInBoundsGEP1_64( + KernArgSegment, + AlignDownOffset, Arg.getName() + ".kernarg.offset.align.down"); + ArgPtr = Builder.CreateBitCast(ArgPtr, + Builder.getInt32Ty()->getPointerTo(AS), + ArgPtr->getName() + ".cast"); } else { - ArgPtr = Builder.CreateStructGEP(CastStruct, Arg.getArgNo(), - Arg.getName() + ".kernarg.offset"); + ArgPtr = Builder.CreateConstInBoundsGEP1_64( + KernArgSegment, + AlignDownOffset, + Arg.getName() + ".kernarg.offset"); + ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS), + ArgPtr->getName() + ".cast"); } assert((!IsExtArg || !IsV3) && "incompatible situation"); - if (IsV3 && Size >= 32) { V4Ty = VectorType::get(VT->getVectorElementType(), 4); // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads @@ -254,6 +246,10 @@ } } + KernArgSegment->addAttribute( + AttributeList::ReturnIndex, + Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); + return true; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -857,8 +857,9 @@ return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; } + uint64_t getExplicitKernArgSize(const Function &F) const; unsigned getKernArgSegmentSize(const Function &F, - unsigned ExplictArgBytes) const; + int64_t ExplicitArgBytes = -1) const; /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -411,11 +411,31 @@ return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); } +uint64_t SISubtarget::getExplicitKernArgSize(const Function &F) const { + assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL); + + const DataLayout &DL = F.getParent()->getDataLayout(); + uint64_t ExplicitArgBytes = 0; + for (const Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + + unsigned Align = DL.getABITypeAlignment(ArgTy); + uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); + ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; + } + + return ExplicitArgBytes; +} + unsigned SISubtarget::getKernArgSegmentSize(const Function &F, - unsigned ExplicitArgBytes) const { - uint64_t TotalSize = ExplicitArgBytes; - unsigned ImplicitBytes = getImplicitArgNumBytes(F); + int64_t ExplicitArgBytes) const { + if (ExplicitArgBytes == -1) + ExplicitArgBytes = getExplicitKernArgSize(F); + unsigned ExplicitOffset = getExplicitKernelArgOffset(F); + + uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; + unsigned ImplicitBytes = getImplicitArgNumBytes(F); if (ImplicitBytes != 0) { unsigned Alignment = getAlignmentForImplicitArgPtr(); TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; Index: test/CodeGen/AMDGPU/ctpop.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop.ll +++ test/CodeGen/AMDGPU/ctpop.ll @@ -301,12 +301,9 @@ ret void } -; FIXME: We currently disallow SALU instructions in all branches, -; but there are some cases when the should be allowed. 
- ; FUNC-LABEL: {{^}}ctpop_i32_in_br: -; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x16 -; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x58 +; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd +; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34 ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[RESULT]], Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -228,6 +228,17 @@ ret void } +; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding: +; HSA: kernarg_segment_byte_size = 120 +; MESA: kernarg_segment_byte_size = 84 +; GCN: kernarg_segment_alignment = 6 +define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 { + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %load = load volatile i32, i32 addrspace(4)* %cast + ret void +} + declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2 declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2 Index: test/CodeGen/AMDGPU/lower-kernargs.ll =================================================================== --- test/CodeGen/AMDGPU/lower-kernargs.ll +++ test/CodeGen/AMDGPU/lower-kernargs.ll @@ -16,23 +16,20 @@ define amdgpu_kernel void @kern_i8(i8 %arg) #0 { ; HSA-LABEL: @kern_i8( ; HSA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]] to [[KERN_I8:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; HSA-NEXT: store i8 [[TMP2]], i8 addrspace(1)* undef, align 1 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_i8( ; MESA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 
[[TMP4]] to i8 -; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; MESA-NEXT: store i8 [[TMP2]], i8 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void ; store i8 %arg, i8 addrspace(1)* undef, align 1 @@ -42,23 +39,20 @@ define amdgpu_kernel void @kern_i16(i16 %arg) #0 { ; HSA-LABEL: @kern_i16( ; HSA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]] to [[KERN_I16:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 -; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; HSA-NEXT: store i16 [[TMP2]], i16 addrspace(1)* undef, align 1 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_i16( ; MESA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I16:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; MESA-NEXT: store i16 [[TMP2]], i16 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void ; store i16 %arg, i16 addrspace(1)* undef, align 1 @@ -68,24 +62,21 @@ define amdgpu_kernel void @kern_f16(half %arg) #0 { ; HSA-LABEL: @kern_f16( ; HSA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call 
nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]] to [[KERN_F16:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 -; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP4]] to half +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half ; HSA-NEXT: store half [[ARG_LOAD]], half addrspace(1)* undef, align 1 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_f16( ; MESA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F16:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP5]] to half +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half ; MESA-NEXT: store half [[ARG_LOAD]], half addrspace(1)* undef, align 1 ; MESA-NEXT: ret void ; @@ -96,23 +87,20 @@ define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 { ; HSA-LABEL: @kern_zeroext_i8( ; HSA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: 
[[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; HSA-NEXT: store i8 [[TMP2]], i8 addrspace(1)* undef, align 1 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_zeroext_i8( ; MESA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !1, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !range !1, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; MESA-NEXT: store i8 [[TMP2]], i8 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void ; store i8 %arg, i8 addrspace(1)* undef, align 1 @@ -122,23 +110,20 @@ define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 { ; HSA-LABEL: @kern_zeroext_i16( ; HSA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 -; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; HSA-NEXT: store i16 [[TMP2]], i16 addrspace(1)* undef, align 1 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_zeroext_i16( ; MESA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* 
@llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !2, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !range !2, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; MESA-NEXT: store i16 [[TMP2]], i16 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void ; store i16 %arg, i16 addrspace(1)* undef, align 1 @@ -148,23 +133,20 @@ define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 { ; HSA-LABEL: @kern_signext_i8( ; HSA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; HSA-NEXT: store i8 [[TMP2]], i8 addrspace(1)* undef, align 1 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_signext_i8( ; MESA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !3, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; 
MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !range !3, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; MESA-NEXT: store i8 [[TMP2]], i8 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void ; store i8 %arg, i8 addrspace(1)* undef, align 1 @@ -174,23 +156,20 @@ define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 { ; HSA-LABEL: @kern_signext_i16( ; HSA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 -; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; HSA-NEXT: store i16 [[TMP2]], i16 addrspace(1)* undef, align 1 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_signext_i16( ; MESA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !4, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !range !4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; MESA-NEXT: store i16 [[TMP2]], i16 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void ; store i16 %arg, i16 addrspace(1)* undef, align 1 @@ -200,35 +179,32 @@ define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) { ; 
HSA-LABEL: @kern_i8_i8( ; HSA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]] to [[KERN_I8_I8:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 -; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 -; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef, align 1 -; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 +; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; HSA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_i8_i8( ; MESA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8_I8:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; MESA-NEXT: 
[[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 -; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1 ; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1 -; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void ; store volatile i8 %arg0, i8 addrspace(1)* undef, align 1 @@ -239,24 +215,21 @@ define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) { ; HSA-LABEL: @kern_v3i8( ; HSA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]] to [[KERN_V3I8:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24 -; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8> +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24 +; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8> ; HSA-NEXT: store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_v3i8( ; MESA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I8:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24 -; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP5]] to <3 x i8> +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* 
[[KERN_V3I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24 +; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8> ; MESA-NEXT: store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4 ; MESA-NEXT: ret void ; @@ -267,23 +240,20 @@ define amdgpu_kernel void @kern_i24(i24 %arg0) { ; HSA-LABEL: @kern_i24( ; HSA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]] to [[KERN_I24:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24 -; HSA-NEXT: store i24 [[TMP4]], i24 addrspace(1)* undef +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24 +; HSA-NEXT: store i24 [[TMP2]], i24 addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_i24( ; MESA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I24:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24 -; MESA-NEXT: store i24 [[TMP5]], i24 addrspace(1)* undef +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24 +; MESA-NEXT: store i24 [[TMP2]], i24 addrspace(1)* undef ; MESA-NEXT: ret void ; store i24 %arg0, i24 addrspace(1)* undef @@ -293,18 +263,17 @@ define amdgpu_kernel void @kern_i32(i32 %arg0) { ; HSA-LABEL: @kern_i32( ; HSA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]] to 
[[KERN_I32:%.*]] addrspace(4)* -; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_i32( ; MESA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32:%.*]] addrspace(4)* -; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -315,18 +284,17 @@ define amdgpu_kernel void @kern_f32(float %arg0) { ; HSA-LABEL: @kern_f32( ; HSA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]] to [[KERN_F32:%.*]] addrspace(4)* -; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)* +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: store float [[ARG0_LOAD]], float addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_f32( ; MESA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F32:%.*]] addrspace(4)* -; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 
addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)* +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: store float [[ARG0_LOAD]], float addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -337,22 +305,21 @@ define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) { ; HSA-LABEL: @kern_v3i32( ; HSA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]] to [[KERN_V3I32:%.*]] addrspace(4)* -; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0 -; HSA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <3 x i32> addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]] to <4 x i32> addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <3 x i32> ; HSA-NEXT: store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_v3i32( ; MESA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I32:%.*]] addrspace(4)* -; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0 -; MESA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <3 x i32> addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]] to <4 x i32> addrspace(4)* +; MESA-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <3 x i32> ; MESA-NEXT: store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4 ; MESA-NEXT: ret void ; @@ -360,30 +327,94 @@ ret void } +define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 { +; HSA-LABEL: @kern_v8i32( +; HSA-NEXT: [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call 
nonnull align 32 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)* +; HSA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_v8i32( +; MESA-NEXT: [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)* +; MESA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef +; MESA-NEXT: ret void +; + store <8 x i32> %arg, <8 x i32> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 { +; HSA-LABEL: @kern_v8i64( +; HSA-NEXT: [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)* +; HSA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_v8i64( +; MESA-NEXT: [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(100) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)* +; MESA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef +; MESA-NEXT: ret void +; + store <8 x i64> %arg, <8 x i64> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 { +; HSA-LABEL: @kern_v16i64( +; HSA-NEXT: [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(128) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)* +; HSA-NEXT: [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_v16i64( +; MESA-NEXT: [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(164) 
i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)* +; MESA-NEXT: [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef +; MESA-NEXT: ret void +; + store <16 x i64> %arg, <16 x i64> addrspace(1)* undef + ret void +} + define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) { ; HSA-LABEL: @kern_i32_v3i32( ; HSA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]] to [[KERN_I32_V3I32:%.*]] addrspace(4)* -; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 1 -; HSA-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0 -; HSA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 16 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <3 x i32> addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]] to <4 x i32> addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <3 x i32> ; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef ; HSA-NEXT: store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_i32_v3i32( ; MESA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32_V3I32:%.*]] addrspace(4)* -; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], 
[[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 1 -; MESA-NEXT: [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0 -; MESA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 52 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <3 x i32> addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]] to <4 x i32> addrspace(4)* +; MESA-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <3 x i32> ; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef ; MESA-NEXT: store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4 ; MESA-NEXT: ret void @@ -399,18 +430,17 @@ define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) { ; HSA-LABEL: @kern_struct_a( ; HSA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]] to [[KERN_STRUCT_A:%.*]] addrspace(4)* -; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_struct_a( ; MESA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_A:%.*]] addrspace(4)* -; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to 
[[STRUCT_A:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -421,18 +451,17 @@ define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 { ; HSA-LABEL: @kern_struct_b_packed( ; HSA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)* -; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_struct_b_packed( ; MESA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)* -; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -443,18 +472,17 @@ define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 { ; HSA-LABEL: @kern_implicit_arg_num_bytes( ; HSA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)* -; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; 
HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_implicit_arg_num_bytes( -; MESA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(80) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)* -; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -462,21 +490,41 @@ ret void } +define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %arg1) #1 { +; HSA-LABEL: @kernel_implicitarg_no_struct_align( +; HSA-NEXT: [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(112) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 64 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kernel_implicitarg_no_struct_align( +; MESA-NEXT: [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(108) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 100 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef +; MESA-NEXT: ret void +; + store i32 %arg1, i32 addrspace(1)* undef + ret void +} + define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 { ; HSA-LABEL: @kern_lds_ptr( ; HSA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = 
call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]] to [[KERN_LDS_PTR:%.*]] addrspace(4)* -; HSA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)* +; HSA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_lds_ptr( ; MESA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR:%.*]] addrspace(4)* -; MESA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)* +; MESA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4 ; MESA-NEXT: ret void ; @@ -487,14 +535,11 @@ define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 { ; HSA-LABEL: @kern_lds_ptr_si( ; HSA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)* ; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_lds_ptr_si( ; MESA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)* ; MESA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4 ; MESA-NEXT: ret void ; @@ -505,35 +550,32 @@ define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 { ; HSA-LABEL: @kern_realign_i8_i8( ; HSA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)* -; HSA-NEXT: 
[[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 -; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 -; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef -; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 +; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; HSA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_realign_i8_i8( ; MESA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* 
+; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 -; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef ; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef ; MESA-NEXT: ret void ; store volatile i8 %arg0, i8 addrspace(1)* undef @@ -544,47 +586,44 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 { ; HSA-LABEL: @kern_realign_i8_i8_i8( ; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 +; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] 
= lshr i32 [[TMP6]], 16 ; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 -; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 -; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 -; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef ; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef -; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_realign_i8_i8_i8( ; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 -; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 -; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 -; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* 
[[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 +; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; MESA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef ; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef ; MESA-NEXT: ret void ; store volatile i8 %arg0, i8 addrspace(1)* undef @@ -596,59 +635,56 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 { ; HSA-LABEL: @kern_realign_i8_i8_i8_i8( ; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 +; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 ; HSA-NEXT: [[TMP8:%.*]] = 
trunc i32 [[TMP7]] to i8 -; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 -; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 -; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24 -; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8 -; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP9]], 24 +; HSA-NEXT: [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8 +; HSA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef ; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef -; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef -; HSA-NEXT: store volatile i8 [[TMP16]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP11]], i8 addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_realign_i8_i8_i8_i8( ; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, 
!invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 -; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 -; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 -; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8 -; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24 -; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 +; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP9]], 24 +; MESA-NEXT: [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8 +; MESA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef ; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP17]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP11]], i8 addrspace(1)* undef ; MESA-NEXT: ret void ; store volatile i8 %arg0, i8 addrspace(1)* undef @@ -661,34 +697,31 @@ define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 { ; HSA-LABEL: @kern_realign_i8_v3i8( ; HSA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] 
= bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; HSA-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i24 -; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP7]] to <3 x i8> -; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8> +; HSA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef ; HSA-NEXT: store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_realign_i8_v3i8( ; MESA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i24 -; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP8]] to <3 x i8> -; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* 
[[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 40 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8> +; MESA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef ; MESA-NEXT: store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -700,35 +733,32 @@ define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 { ; HSA-LABEL: @kern_realign_i8_i16( ; HSA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 -; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16 -; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef -; HSA-NEXT: store volatile i16 [[TMP8]], i16 addrspace(1)* undef +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; HSA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i16 [[TMP5]], i16 addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_realign_i8_i16( ; MESA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: 
[[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16 -; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 -; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i16 [[TMP9]], i16 addrspace(1)* undef +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i16 [[TMP5]], i16 addrspace(1)* undef ; MESA-NEXT: ret void ; store volatile i8 %arg0, i8 addrspace(1)* undef @@ -739,35 +769,32 @@ define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 { ; HSA-LABEL: @kern_realign_i1_i1( ; HSA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 -; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 -; HSA-NEXT: [[TMP8:%.*]] = trunc i32 
[[TMP7]] to i1 -; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef -; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 +; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; HSA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_realign_i1_i1( ; MESA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 -; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 -; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1 +; MESA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef ; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef -; MESA-NEXT: store volatile i1 
[[TMP9]], i1 addrspace(1)* undef ; MESA-NEXT: ret void ; store volatile i1 %arg0, i1 addrspace(1)* undef @@ -778,47 +805,44 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 { ; HSA-LABEL: @kern_realign_i1_i1_i1( ; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 -; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 +; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 ; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1 -; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 -; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1 -; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef ; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef -; HSA-NEXT: store 
volatile i1 [[TMP12]], i1 addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_realign_i1_i1_i1( ; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 -; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 -; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1 -; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 -; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 +; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1 +; MESA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef ; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef -; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef -; MESA-NEXT: store volatile i1 [[TMP13]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef ; MESA-NEXT: ret void ; store volatile i1 %arg0, i1 addrspace(1)* undef @@ 
-830,59 +854,56 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
 ; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
 ; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] addrspace(4)*
-; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
-; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
+; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
+; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
+; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
 ; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
-; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
-; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1
-; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
-; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i1
-; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef
+; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
+; HSA-NEXT: [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
+; HSA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef
+; HSA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef
 ; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef
-; HSA-NEXT: store volatile i1 [[TMP12]], i1 addrspace(1)* undef
-; HSA-NEXT: store volatile i1 [[TMP16]], i1 addrspace(1)* undef
+; HSA-NEXT: store volatile i1 [[TMP11]], i1 addrspace(1)* undef
 ; HSA-NEXT: ret void
 ;
 ; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
 ; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
-; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] addrspace(4)*
-; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
-; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
-; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1
-; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
-; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1
-; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24
-; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i1
+; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
+; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
+; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
+; MESA-NEXT: [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
+; MESA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef
 ; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef
-; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef
-; MESA-NEXT: store volatile i1 [[TMP13]], i1 addrspace(1)* undef
-; MESA-NEXT: store volatile i1 [[TMP17]], i1 addrspace(1)* undef
+; MESA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef
+; MESA-NEXT: store volatile i1 [[TMP11]], i1 addrspace(1)* undef
 ; MESA-NEXT: ret void
 ;
 store volatile i1 %arg0, i1 addrspace(1)* undef
@@ -895,34 +916,31 @@ define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
 ; HSA-LABEL: @kern_realign_i1_v3i1(
 ; HSA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)*
-; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
-; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4
-; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; HSA-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP7]] to <3 x i1>
-; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef
+; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
+; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4
+; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
+; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
+; HSA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef
 ; HSA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
 ; HSA-NEXT: ret void
 ;
 ; MESA-LABEL: @kern_realign_i1_v3i1(
 ; MESA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
-; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)*
-; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
-; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
-; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i3
-; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP8]] to <3 x i1>
-; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef
+; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 40
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
+; MESA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
+; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
+; MESA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef
 ; MESA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
 ; MESA-NEXT: ret void
 ;
@@ -934,35 +952,32 @@ define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
 ; HSA-LABEL: @kern_realign_i1_i16(
 ; HSA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)*
-; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
-; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
-; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
-; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef
-; HSA-NEXT: store volatile i16 [[TMP8]], i16 addrspace(1)* undef
+; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
+; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
+; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; HSA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef
+; HSA-NEXT: store volatile i16 [[TMP5]], i16 addrspace(1)* undef
 ; HSA-NEXT: ret void
 ;
 ; MESA-LABEL: @kern_realign_i1_i16(
 ; MESA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
-; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)*
-; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
-; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16
-; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
-; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef
-; MESA-NEXT: store volatile i16 [[TMP9]], i16 addrspace(1)* undef
+; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
+; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; MESA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef
+; MESA-NEXT: store volatile i16 [[TMP5]], i16 addrspace(1)* undef
 ; MESA-NEXT: ret void
 ;
 store volatile i1 %arg0, i1 addrspace(1)* undef
@@ -973,95 +988,92 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
 ; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
 ; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)*
-; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
-; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
+; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
 ; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
-; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
-; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
-; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
-; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
-; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
-; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8
-; HSA-NEXT: [[TMP17:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
-; HSA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP17]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; HSA-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP18]], 8
+; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
+; HSA-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
+; HSA-NEXT: [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
+; HSA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
+; HSA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; HSA-NEXT: [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
+; HSA-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
+; HSA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
+; HSA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; HSA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
+; HSA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
+; HSA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
+; HSA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; HSA-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; HSA-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
 ; HSA-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
-; HSA-NEXT: [[TMP21:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
-; HSA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP21]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP22:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; HSA-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 16
-; HSA-NEXT: [[TMP24:%.*]] = trunc i32 [[TMP23]] to i8
-; HSA-NEXT: [[TMP25:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
-; HSA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP25]] to i32 addrspace(4)*
-; HSA-NEXT: [[TMP26:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; HSA-NEXT: [[TMP27:%.*]] = lshr i32 [[TMP26]], 24
-; HSA-NEXT: [[TMP28:%.*]] = trunc i32 [[TMP27]] to i8
-; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef
+; HSA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef
+; HSA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef
 ; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef
-; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef
-; HSA-NEXT: store volatile i8 [[TMP16]], i8 addrspace(1)* undef
+; HSA-NEXT: store volatile i8 [[TMP11]], i8 addrspace(1)* undef
+; HSA-NEXT: store volatile i8 [[TMP14]], i8 addrspace(1)* undef
+; HSA-NEXT: store volatile i8 [[TMP17]], i8 addrspace(1)* undef
 ; HSA-NEXT: store volatile i8 [[TMP20]], i8 addrspace(1)* undef
-; HSA-NEXT: store volatile i8 [[TMP24]], i8 addrspace(1)* undef
-; HSA-NEXT: store volatile i8 [[TMP28]], i8 addrspace(1)* undef
 ; HSA-NEXT: ret void
 ;
 ; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
 ; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
-; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)*
-; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
-; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
-; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
-; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8
-; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
-; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)*
-; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
-; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24
+; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
+; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
+; MESA-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
+; MESA-NEXT: [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
+; MESA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
+; MESA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
+; MESA-NEXT: [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
+; MESA-NEXT: [[TMP13:%.*]] = lshr i32 [[TMP12]], 8 +; MESA-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8 +; MESA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40 +; MESA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0 +; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 16 ; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -; MESA-NEXT: [[TMP18:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 -; MESA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP18]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP19:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP19]], 8 -; MESA-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 -; MESA-NEXT: [[TMP22:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 -; MESA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP22]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP23:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP24:%.*]] = lshr i32 [[TMP23]], 16 -; MESA-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i8 -; MESA-NEXT: [[TMP26:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 -; MESA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP26]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP27:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP28:%.*]] = lshr i32 [[TMP27]], 24 -; MESA-NEXT: [[TMP29:%.*]] = trunc i32 [[TMP28]] to i8 +; MESA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40 +; MESA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0 +; MESA-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP18]], 24 +; MESA-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8 +; MESA-NEXT: store volatile i8 [[TMP2]], i8 addrspace(1)* undef ; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP11]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP14]], i8 addrspace(1)* undef ; MESA-NEXT: store volatile i8 [[TMP17]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP21]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP25]], i8 addrspace(1)* undef -; MESA-NEXT: store volatile i8 [[TMP29]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP20]], i8 addrspace(1)* undef ; MESA-NEXT: ret void ; store volatile i8 %arg0, i8 addrspace(1)* undef @@ -1077,37 +1089,34 @@ define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 { ; HSA-LABEL: @kern_realign_f16_f16( ; HSA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 
addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)* -; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 -; HSA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP4]] to half -; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 -; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 -; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16 -; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP8]] to half +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half ; HSA-NEXT: store volatile half [[ARG0_LOAD]], half addrspace(1)* undef ; HSA-NEXT: store volatile half [[ARG1_LOAD]], half addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_realign_f16_f16( ; MESA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, 
!invariant.load !0 +; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; MESA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP5]] to half -; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 -; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16 -; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 -; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP9]] to half +; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half ; MESA-NEXT: store volatile half [[ARG0_LOAD]], half addrspace(1)* undef ; MESA-NEXT: store volatile half [[ARG1_LOAD]], half addrspace(1)* undef ; MESA-NEXT: ret void @@ -1120,18 +1129,17 @@ define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 { ; HSA-LABEL: @kern_global_ptr( ; HSA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)* -; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_global_ptr( ; MESA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)* -; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 
addrspace(1)* addrspace(4)* +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -1142,18 +1150,17 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 { ; HSA-LABEL: @kern_global_ptr_dereferencable( ; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)* -; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable !1 +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable !1 ; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_global_ptr_dereferencable( ; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)* -; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable !5 +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable !5 ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -1164,18 +1171,17 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 { ; HSA-LABEL: @kern_global_ptr_dereferencable_or_null( ; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]] to 
[[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)* -; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable_or_null !2 +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable_or_null !2 ; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_global_ptr_dereferencable_or_null( ; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)* -; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable_or_null !6 +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable_or_null !6 ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -1186,18 +1192,17 @@ define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 { ; HSA-LABEL: @kern_nonnull_global_ptr( ; HSA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)* -; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], [[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !nonnull !0 +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* 
[[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !nonnull !0 ; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_nonnull_global_ptr( ; MESA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)* -; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], [[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !nonnull !0 +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !nonnull !0 ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -1208,18 +1213,17 @@ define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 { ; HSA-LABEL: @kern_align32_global_ptr( ; HSA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)* -; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !align !3 +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !align !3 ; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_align32_global_ptr( ; MESA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)* -; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 -; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !align !7 +; MESA-NEXT: 
[[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !align !7 ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -1230,14 +1234,11 @@ define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 { ; HSA-LABEL: @kern_noalias_global_ptr( ; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] addrspace(4)* ; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_noalias_global_ptr( ; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] addrspace(4)* ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -1248,15 +1249,12 @@ define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 { ; HSA-LABEL: @kern_noalias_global_ptr_x2( ; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)* ; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef ; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_noalias_global_ptr_x2( ; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)* ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void @@ -1270,9 +1268,9 @@ ; HSA-LABEL: @struct_i8_i8_arg( ; HSA-NEXT: entry: ; HSA-NEXT: [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP0:%.*]] = bitcast i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]] to [[STRUCT_I8_I8_ARG:%.*]] addrspace(4)* -; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[STRUCT_I8_I8_ARG]], [[STRUCT_I8_I8_ARG]] addrspace(4)* [[TMP0]], i32 0, i32 0 -; HSA-NEXT: [[IN_LOAD:%.*]] = 
load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)* +; HSA-NEXT: [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0 ; HSA-NEXT: [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1 ; HSA-NEXT: store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4 @@ -1282,10 +1280,9 @@ ; MESA-LABEL: @struct_i8_i8_arg( ; MESA-NEXT: entry: ; MESA-NEXT: [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to [[STRUCT_I8_I8_ARG:%.*]] addrspace(4)* -; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[STRUCT_I8_I8_ARG]], [[STRUCT_I8_I8_ARG]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; MESA-NEXT: [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)* +; MESA-NEXT: [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0 ; MESA-NEXT: [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1 ; MESA-NEXT: store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4 @@ -1304,9 +1301,9 @@ ; HSA-LABEL: @struct_i8_i16_arg( ; HSA-NEXT: entry: ; HSA-NEXT: [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP0:%.*]] = bitcast i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]] to [[STRUCT_I8_I16_ARG:%.*]] addrspace(4)* -; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[STRUCT_I8_I16_ARG]], [[STRUCT_I8_I16_ARG]] addrspace(4)* [[TMP0]], i32 0, i32 0 -; HSA-NEXT: [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)* +; HSA-NEXT: [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0 ; HSA-NEXT: [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1 ; HSA-NEXT: store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4 @@ -1316,10 +1313,9 @@ ; MESA-LABEL: @struct_i8_i16_arg( ; MESA-NEXT: entry: ; MESA-NEXT: [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36 -; 
MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to [[STRUCT_I8_I16_ARG:%.*]] addrspace(4)* -; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[STRUCT_I8_I16_ARG]], [[STRUCT_I8_I16_ARG]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; MESA-NEXT: [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)* +; MESA-NEXT: [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0 ; MESA-NEXT: [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1 ; MESA-NEXT: store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4 @@ -1338,9 +1334,9 @@ ; HSA-LABEL: @array_2xi8_arg( ; HSA-NEXT: entry: ; HSA-NEXT: [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP0:%.*]] = bitcast i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]] to [[ARRAY_2XI8_ARG:%.*]] addrspace(4)* -; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[ARRAY_2XI8_ARG]], [[ARRAY_2XI8_ARG]] addrspace(4)* [[TMP0]], i32 0, i32 0 -; HSA-NEXT: [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)* +; HSA-NEXT: [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0 ; HSA-NEXT: [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1 ; HSA-NEXT: store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4 @@ -1350,10 +1346,9 @@ ; MESA-LABEL: @array_2xi8_arg( ; MESA-NEXT: entry: ; MESA-NEXT: [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to [[ARRAY_2XI8_ARG:%.*]] addrspace(4)* -; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[ARRAY_2XI8_ARG]], [[ARRAY_2XI8_ARG]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; MESA-NEXT: [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)* +; MESA-NEXT: [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0 ; MESA-NEXT: [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1 ; MESA-NEXT: store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4 @@ -1372,9 +1367,9 @@ ; HSA-LABEL: @array_2xi1_arg( ; HSA-NEXT: entry: ; HSA-NEXT: [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = 
call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP0:%.*]] = bitcast i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]] to [[ARRAY_2XI1_ARG:%.*]] addrspace(4)* -; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[ARRAY_2XI1_ARG]], [[ARRAY_2XI1_ARG]] addrspace(4)* [[TMP0]], i32 0, i32 0 -; HSA-NEXT: [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)* +; HSA-NEXT: [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0 ; HSA-NEXT: [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1 ; HSA-NEXT: store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4 @@ -1384,10 +1379,9 @@ ; MESA-LABEL: @array_2xi1_arg( ; MESA-NEXT: entry: ; MESA-NEXT: [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to [[ARRAY_2XI1_ARG:%.*]] addrspace(4)* -; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[ARRAY_2XI1_ARG]], [[ARRAY_2XI1_ARG]] addrspace(4)* [[TMP1]], i32 0, i32 0 -; MESA-NEXT: [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)* +; MESA-NEXT: [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0 ; MESA-NEXT: [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1 ; MESA-NEXT: store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4 @@ -1408,8 +1402,6 @@ ; ; MESA-LABEL: @only_empty_struct( ; MESA-NEXT: [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(36) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT]], i64 36 -; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[ONLY_EMPTY_STRUCT:%.*]] addrspace(4)* ; MESA-NEXT: ret void ; ret void @@ -1418,18 +1410,17 @@ define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 { ; HSA-LABEL: @empty_struct_with_other( ; HSA-NEXT: [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() -; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]] to [[EMPTY_STRUCT_WITH_OTHER:%.*]] addrspace(4)* -; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[EMPTY_STRUCT_WITH_OTHER]], [[EMPTY_STRUCT_WITH_OTHER]] addrspace(4)* [[TMP1]], i32 0, i32 1 -; HSA-NEXT: [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: 
+; HSA-NEXT: [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
+; HSA-NEXT: [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT: store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
 ; HSA-NEXT: ret void
 ;
 ; MESA-LABEL: @empty_struct_with_other(
 ; MESA-NEXT: [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
-; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[EMPTY_STRUCT_WITH_OTHER:%.*]] addrspace(4)*
-; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[EMPTY_STRUCT_WITH_OTHER]], [[EMPTY_STRUCT_WITH_OTHER]] addrspace(4)* [[TMP2]], i32 0, i32 1
-; MESA-NEXT: [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
+; MESA-NEXT: [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT: store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
 ; MESA-NEXT: ret void
 ;
Index: test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
===================================================================
--- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -70,14 +70,14 @@
 ; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
 
-; GCN: ; %Flow5
+; GCN: ; %Flow4
 ; GCN-NEXT: s_or_b64 exec, exec
 ; GCN: v_cmp_ne_u32_e32 vcc, 0
 
 ; GCN: ; %exit1
 ; GCN: ds_write_b32
 
-; GCN: %Flow6
+; GCN: %Flow5
 ; GCN-NEXT: s_or_b64 exec, exec
 ; GCN: v_cmp_ne_u32_e32 vcc, 0
 ; GCN-NEXT: s_and_saveexec_b64
Index: test/CodeGen/AMDGPU/sgpr-control-flow.ll
===================================================================
--- test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -38,14 +38,14 @@
 ; SI: s_cbranch_scc0 [[IF:BB[0-9]+_[0-9]+]]
 
 ; SI: ; %bb.1: ; %else
-; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
-; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2e
+; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2e
+; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x37
 ; SI-NOT: add
 ; SI: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; SI: [[IF]]: ; %if
-; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
+; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
+; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
 ; SI-NOT: add
 
 ; SI: [[ENDIF]]: ; %endif
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -14,7 +14,7 @@
 ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
 ; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
 
-; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock4
+; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
 ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1
 ; SI: s_and_saveexec_b64