Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -1854,7 +1854,12 @@
 On every specification that takes a ``<abi>:<pref>``, specifying the
 ``<pref>`` alignment is optional. If omitted, the preceding ``:``
-should be omitted too and ``<pref>`` will be equal to ``<abi>``.
+should be omitted too and ``<pref>`` will be equal to ``<abi>``. The
+``<abi>`` alignment is the default assumed alignment for a load or
+store unless otherwise specified. The preferred alignment may be lower
+than the ABI alignment. If it is, the lower value should only be used
+in cases where the pointer value is not captured, unless the use is
+otherwise known to be safe.
 
 When constructing the data layout for a given target, LLVM starts with
 a default set of specifications which are then (possibly) overridden by
@@ -1872,6 +1877,8 @@
 - ``i32:32:32`` - i32 is 32-bit aligned
 - ``i64:32:64`` - i64 has ABI alignment of 32-bits but preferred
   alignment of 64-bits
+- ``i64:64:32`` - i64 has an ABI alignment of 64-bits but preferred
+  alignment of 32-bits
 - ``f16:16:16`` - half is 16-bit aligned
 - ``f32:32:32`` - float is 32-bit aligned
 - ``f64:64:64`` - double is 64-bit aligned
Index: include/llvm/IR/DataLayout.h
===================================================================
--- include/llvm/IR/DataLayout.h
+++ include/llvm/IR/DataLayout.h
@@ -419,19 +419,24 @@
     return 8 * getTypeAllocSize(Ty);
   }
 
-  /// \brief Returns the minimum ABI-required alignment for the specified type.
+  /// \brief Returns the minimum ABI-required alignment for allocas and globals
+  /// with the specified type.
   unsigned getABITypeAlignment(Type *Ty) const;
 
   /// \brief Returns the minimum ABI-required alignment for an integer type of
   /// the specified bitwidth.
   unsigned getABIIntegerTypeAlignment(unsigned BitWidth) const;
 
-  /// \brief Returns the preferred stack/global alignment for the specified
-  /// type.
-  ///
-  /// This is always at least as good as the ABI alignment.
+  /// \brief Returns the preferred stack alignment for the specified type. This
+  /// may be less than the ABI alignment.
   unsigned getPrefTypeAlignment(Type *Ty) const;
 
+  /// \return The preferred alignment that is also suitable as the ABI
+  /// alignment.
+  unsigned getPrefABITypeAlignment(Type *Ty) const {
+    return std::max(getPrefTypeAlignment(Ty), getABITypeAlignment(Ty));
+  }
+
   /// \brief Returns the preferred alignment for the specified type, returned as
   /// log2 of the value (a shift amount).
   unsigned getPreferredTypeAlignmentShift(Type *Ty) const;
Index: lib/Analysis/Loads.cpp
===================================================================
--- lib/Analysis/Loads.cpp
+++ lib/Analysis/Loads.cpp
@@ -233,8 +233,8 @@
   // know the size of the base type and the loaded type to do anything in this
   // case.
   if (BaseType && BaseType->isSized()) {
-    if (BaseAlign == 0)
-      BaseAlign = DL.getPrefTypeAlignment(BaseType);
+    if (BaseAlign == 0) // Seems to be unreachable?
+      BaseAlign = DL.getABITypeAlignment(BaseType);
 
     if (Align <= BaseAlign) {
       // Check if the load is within the bounds of the underlying object.
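[Note, not part of the patch: a minimal sketch of the new helper's semantics,
assuming a tree with this change applied. Once the preferred alignment may be
below the ABI alignment, getPrefABITypeAlignment clamps it back up to the ABI
minimum, which is what the alloca and sret-demotion callers below rely on.]

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    // i64 with an 8-byte ABI alignment but only a 4-byte preferred alignment.
    DataLayout DL("i64:64:32");
    Type *I64 = Type::getInt64Ty(Ctx);
    outs() << DL.getABITypeAlignment(I64) << '\n';     // 8
    outs() << DL.getPrefTypeAlignment(I64) << '\n';    // 4, now allowed < ABI
    outs() << DL.getPrefABITypeAlignment(I64) << '\n'; // max(4, 8) == 8
    return 0;
  }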
Index: lib/CodeGen/AtomicExpandPass.cpp
===================================================================
--- lib/CodeGen/AtomicExpandPass.cpp
+++ lib/CodeGen/AtomicExpandPass.cpp
@@ -1464,7 +1464,7 @@
   bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
   Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
 
-  unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);
+  unsigned AllocaAlignment = DL.getPrefABITypeAlignment(SizedIntTy);
 
   // TODO: the "order" argument type is "int", not int32. So
   // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1309,10 +1309,12 @@
                                       unsigned char TargetFlags) {
   assert((TargetFlags == 0 || isTarget) &&
          "Cannot set target flags on target-independent globals");
-  if (Alignment == 0)
-    Alignment = MF->getFunction()->optForSize()
-                    ? getDataLayout().getABITypeAlignment(C->getType())
-                    : getDataLayout().getPrefTypeAlignment(C->getType());
+  if (Alignment == 0) {
+    const DataLayout &DL = getDataLayout();
+    unsigned PrefAlign = DL.getPrefTypeAlignment(C->getType());
+    Alignment = MF->getFunction()->optForSize() ?
+      std::min(DL.getABITypeAlignment(C->getType()), PrefAlign) : PrefAlign;
+  }
   unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, Opc, getVTList(VT), None);
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7705,7 +7705,7 @@
     //       assert(!CS.hasInAllocaArgument() &&
     //              "sret demotion is incompatible with inalloca");
     uint64_t TySize = DL.getTypeAllocSize(CLI.RetTy);
-    unsigned Align = DL.getPrefTypeAlignment(CLI.RetTy);
+    unsigned Align = DL.getPrefABITypeAlignment(CLI.RetTy);
     MachineFunction &MF = CLI.DAG.getMachineFunction();
     DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
     Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy);
Index: lib/IR/DataLayout.cpp
===================================================================
--- lib/IR/DataLayout.cpp
+++ lib/IR/DataLayout.cpp
@@ -101,7 +101,6 @@
 LayoutAlignElem
 LayoutAlignElem::get(AlignTypeEnum align_type, unsigned abi_align,
                      unsigned pref_align, uint32_t bit_width) {
-  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
   LayoutAlignElem retval;
   retval.AlignType = align_type;
   retval.ABIAlign = abi_align;
@@ -128,7 +127,6 @@
 PointerAlignElem
 PointerAlignElem::get(uint32_t AddressSpace, unsigned ABIAlign,
                       unsigned PrefAlign, uint32_t TypeByteWidth) {
-  assert(ABIAlign <= PrefAlign && "Preferred alignment worse than ABI!");
   PointerAlignElem retval;
   retval.AddressSpace = AddressSpace;
   retval.ABIAlign = ABIAlign;
@@ -422,10 +420,6 @@
   if (pref_align != 0 && !isPowerOf2_64(pref_align))
     report_fatal_error("Invalid preferred alignment, must be a power of 2");
 
-  if (pref_align < abi_align)
-    report_fatal_error(
-        "Preferred alignment cannot be less than the ABI alignment");
-
   for (LayoutAlignElem &Elem : Alignments) {
     if (Elem.AlignType == (unsigned)align_type &&
         Elem.TypeBitWidth == bit_width) {
@@ -451,10 +445,6 @@
 void DataLayout::setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign,
                                      unsigned PrefAlign,
                                      uint32_t TypeByteWidth) {
-  if (PrefAlign < ABIAlign)
-    report_fatal_error(
-        "Preferred alignment cannot be less than the ABI alignment");
-
   PointersTy::iterator I = findPointerLowerBound(AddrSpace);
   if (I == Pointers.end() || I->AddressSpace != AddrSpace) {
     Pointers.insert(I, PointerAlignElem::get(AddrSpace, ABIAlign, PrefAlign,
Index: lib/IR/Value.cpp
===================================================================
--- lib/IR/Value.cpp
+++ lib/IR/Value.cpp
@@ -630,7 +630,7 @@
     if (Align == 0) {
       Type *AllocatedType = AI->getAllocatedType();
       if (AllocatedType->isSized())
-        Align = DL.getPrefTypeAlignment(AllocatedType);
+        Align = DL.getPrefABITypeAlignment(AllocatedType);
     }
   } else if (auto CS = ImmutableCallSite(this))
     Align = CS.getAttributes().getParamAlignment(AttributeSet::ReturnIndex);
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -159,15 +159,15 @@
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {
     // 32-bit pointers.
-    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
-           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64:32-v96:128:32"
+           "-v192:256:32-v256:256:32-v512:512:32-v1024:1024:32-v2048:2048:32-n32:64";
   }
 
   // 32-bit private, local, and region pointers. 64-bit global, constant and
   // flat.
-  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
-         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
-         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+  return "e-p:32:32-p1:64:64:32-p2:64:64:32-p3:32:32-p4:64:64:32-p5:32:32"
+         "-i64:64:32-v16:16-v24:32-v32:32-v48:64:32-v96:128:32"
+         "-v192:256:32-v256:256:32-v512:512:32-v1024:1024:32-v2048:2048:32-n32:64";
 }
 
 LLVM_READNONE
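[Note, not part of the patch: why the AMDGPU scratch sizes shrink in the tests
below. The new ":32" suffixes give i64 and the wide vector types a 4-byte
preferred alignment, so their stack slots can be packed, while the unchanged
ABI alignment still governs loads and stores that carry no explicit alignment.
A sketch of the resulting queries, assuming the patched tree:]

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    // Subset of the new layout: 256-bit vectors keep a 32-byte ABI alignment
    // but prefer only 4 bytes for stack objects.
    DataLayout DL("e-p:32:32-i64:64:32-v256:256:32");
    Type *V4F64 = VectorType::get(Type::getDoubleTy(Ctx), 4); // 256 bits
    outs() << DL.getABITypeAlignment(V4F64) << '\n';  // 32
    outs() << DL.getPrefTypeAlignment(V4F64) << '\n'; // 4
    return 0;
  }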
Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -231,7 +231,7 @@
   if (AI.getAllocatedType()->isSized()) {
     // If the alignment is 0 (unspecified), assign it the preferred alignment.
     if (AI.getAlignment() == 0)
-      AI.setAlignment(DL.getPrefTypeAlignment(AI.getAllocatedType()));
+      AI.setAlignment(DL.getPrefABITypeAlignment(AI.getAllocatedType()));
 
     // Move all alloca's of zero byte objects to the entry block and merge them
     // together. Note that we only do this for alloca's, because malloc should
@@ -1188,11 +1188,12 @@
     return eraseInstFromFunction(SI);
 
   // Attempt to improve the alignment.
+  unsigned PrefAlign = DL.getPrefTypeAlignment(Val->getType());
   unsigned KnownAlign = getOrEnforceKnownAlignment(
-      Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT);
+      Ptr, PrefAlign, DL, &SI, &AC, &DT);
   unsigned StoreAlign = SI.getAlignment();
-  unsigned EffectiveStoreAlign =
-      StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType());
+  unsigned EffectiveStoreAlign = StoreAlign != 0 ?
+    StoreAlign : std::min(PrefAlign, DL.getABITypeAlignment(Val->getType()));
 
   if (KnownAlign > EffectiveStoreAlign)
     SI.setAlignment(KnownAlign);
Index: lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
===================================================================
--- lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -694,7 +694,7 @@
   // Convert 0 to the default alignment.
   if (Alignment == 0)
-    Alignment = DL.getPrefTypeAlignment(OrigTy);
+    Alignment = DL.getABITypeAlignment(OrigTy);
 
   if (IsStore)
     NumInstrumentedStores++;
Index: lib/Transforms/Utils/SimplifyLibCalls.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -751,23 +751,23 @@
   // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0
   if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) {
     IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8);
-    unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType);
-
-    if (getKnownAlignment(LHS, DL, CI) >= PrefAlignment &&
-        getKnownAlignment(RHS, DL, CI) >= PrefAlignment) {
-
+    unsigned PrefAlign = DL.getPrefTypeAlignment(IntType);
+    unsigned LHSAlign, RHSAlign;
+    if ((LHSAlign = getKnownAlignment(LHS, DL, CI)) >= PrefAlign &&
+        (RHSAlign = getKnownAlignment(RHS, DL, CI)) >= PrefAlign) {
       Type *LHSPtrTy =
           IntType->getPointerTo(LHS->getType()->getPointerAddressSpace());
       Type *RHSPtrTy =
           IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
       Value *LHSV =
-          B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"), "lhsv");
+          B.CreateAlignedLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"),
+                              LHSAlign, "lhsv");
       Value *RHSV =
-          B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv");
-
+          B.CreateAlignedLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"),
+                              RHSAlign, "rhsv");
       return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp");
     }
   }
Index: test/Assembler/datalayout-preferred-alignment-less-abi-alignment-i32.ll
===================================================================
--- /dev/null
+++ test/Assembler/datalayout-preferred-alignment-less-abi-alignment-i32.ll
@@ -0,0 +1,3 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+target datalayout = "p:32:32:16"
+; CHECK: target datalayout = "p:32:32:16"
Index: test/Assembler/datalayout-preferred-alignment-less-size-i64.ll
===================================================================
--- /dev/null
+++ test/Assembler/datalayout-preferred-alignment-less-size-i64.ll
@@ -0,0 +1,3 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+target datalayout = "i64:64:16"
+; CHECK: target datalayout = "i64:64:16"
Index: test/Assembler/invalid-datalayout14.ll
===================================================================
--- test/Assembler/invalid-datalayout14.ll
+++ /dev/null
@@ -1,3 +0,0 @@
-; RUN: not llvm-as < %s 2>&1 | FileCheck %s
-target datalayout = "i64:64:16"
-; CHECK: Preferred alignment cannot be less than the ABI alignment
Index: test/Assembler/invalid-datalayout18.ll
===================================================================
--- test/Assembler/invalid-datalayout18.ll
+++ /dev/null
@@ -1,3 +0,0 @@
-; RUN: not llvm-as < %s 2>&1 | FileCheck %s
-target datalayout = "p:32:32:16"
-; CHECK: Preferred alignment cannot be less than the ABI alignment
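[Note, not part of the patch: the memcmp fold above now has to emit explicitly
aligned loads. A load with no align annotation is assumed to have the type's
ABI alignment, which can overstate what getKnownAlignment actually proved once
preferred alignments may be smaller. A hypothetical helper sketching the
pattern; the name loadWithProvenAlign is illustrative, not from the patch:]

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Load Len bytes from Ptr as a single integer, annotating the load with the
  // alignment we proved instead of leaving it implicit (implicit == ABI).
  static Value *loadWithProvenAlign(IRBuilder<> &B, Value *Ptr,
                                    unsigned ProvenAlign, unsigned Len) {
    IntegerType *IntTy = IntegerType::get(B.getContext(), Len * 8);
    Type *PtrTy =
        IntTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
    return B.CreateAlignedLoad(B.CreateBitCast(Ptr, PtrTy), ProvenAlign);
  }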
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -388,8 +388,7 @@
   ret void
 }
 
-; FIXME: Should be able to do without stack access. The used stack
-; space is also 2x what should be required.
+; FIXME: Should be able to do without stack access.
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
 ; GCN: SCRATCH_RSRC_DWORD
@@ -410,7 +409,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-; GCN: ScratchSize: 64
+; GCN: ScratchSize: 36
 define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
@@ -438,11 +437,27 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-; GCN: ScratchSize: 128
+; GCN: ScratchSize: 68
 define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
   store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
   ret void
 }
 
+; GCN-LABEL: {{^}}dynamic_insertelement_v16f64:
+; GCN: ScratchSize: 132
+define void @dynamic_insertelement_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, i32 %b) nounwind {
+  %vecins = insertelement <16 x double> %a, double 8.0, i32 %b
+  store <16 x double> %vecins, <16 x double> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v32f64:
+; GCN: ScratchSize: 260
+define void @dynamic_insertelement_v32f64(<32 x double> addrspace(1)* %out, <32 x double> %a, i32 %b) nounwind {
+  %vecins = insertelement <32 x double> %a, double 8.0, i32 %b
+  store <32 x double> %vecins, <32 x double> addrspace(1)* %out, align 4
+  ret void
+}
+
 declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone
Index: test/Transforms/InstCombine/low-preferred-alignment.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/low-preferred-alignment.ll
@@ -0,0 +1,188 @@
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+target datalayout = "p:64:64:32-i64:64:32-a0:32-n32:64"
+
+%unsized = type {}
+
+; CHECK: @lds_align8 = internal unnamed_addr addrspace(3) global [256 x i64] undef, align 8
+; CHECK: @lds_align4 = internal unnamed_addr addrspace(3) global [256 x i64] undef, align 4
+; CHECK: @lds_align1 = internal unnamed_addr addrspace(3) global [256 x i64] undef, align 4
+; CHECK: @lds_noalign = internal unnamed_addr addrspace(3) global [256 x i64] undef
+; CHECK: @extern_lds_align8 = unnamed_addr addrspace(3) global [256 x i64] undef, align 8
+; CHECK: @extern_lds_align4 = unnamed_addr addrspace(3) global [256 x i64] undef, align 4
+; CHECK: @extern_lds_align1 = unnamed_addr addrspace(3) global [256 x i64] undef, align 1
+; CHECK: @extern_lds_noalign = unnamed_addr addrspace(3) global [256 x i64] undef
+; CHECK: @const_array_i64 = addrspace(2) constant [12 x i64] [i64 2, i64 9, i64 4, i64 22, i64 2, i64 9, i64 4, i64 22, i64 2, i64 9, i64 4, i64 22]
+
+declare void @use.i64(i64)
+declare void @use.p0i64(i64* align 1)
+declare void @use.p0.ptr(%unsized* align 1)
+declare void @use.p3i64(i64 addrspace(3)* align 1)
+declare void @llvm.memcpy.p0i8.p2i8.i64(i8*, i8 addrspace(2)*, i64, i32, i1)
+
+; The alloca must have at least the ABI alignment in case it is passed to a call
+; CHECK-LABEL: @adjust_alloca_align_i64(
+; CHECK: %alloca = alloca i64, align 8
+define void @adjust_alloca_align_i64() {
+  %alloca = alloca i64
+  call void @use.p0i64(i64* %alloca)
+  ret void
+}
+
+; CHECK-LABEL: @adjust_alloca_align_size0(
+; CHECK: %alloca = alloca %unsized, align 4
+define void @adjust_alloca_align_size0() {
+  %alloca = alloca %unsized
+  call void @use.p0.ptr(%unsized* %alloca)
+  ret void
+}
+
+; CHECK-LABEL: @store_alloca_i64(
+; CHECK: %alloca0 = alloca i64, align 8
+; CHECK: %alloca1 = alloca i64, align 4
+; CHECK: %alloca4 = alloca i64, align 4
+; CHECK: %alloca8 = alloca i64, align 8
+
+; CHECK: store i64 123, i64* %alloca0, align 8
+; CHECK: store i64 123, i64* %alloca1, align 4
+; CHECK: store i64 123, i64* %alloca4, align 4
+; CHECK: store i64 123, i64* %alloca8, align 8
+define void @store_alloca_i64() {
+  %alloca0 = alloca i64
+  %alloca1 = alloca i64, align 1
+  %alloca4 = alloca i64, align 4
+  %alloca8 = alloca i64, align 8
+
+  store i64 123, i64* %alloca0
+  call void @use.p0i64(i64* %alloca0)
+
+  store i64 123, i64* %alloca1, align 1
+  call void @use.p0i64(i64* %alloca1)
+
+  store i64 123, i64* %alloca4
+  call void @use.p0i64(i64* %alloca4)
+
+  store i64 123, i64* %alloca8
+  call void @use.p0i64(i64* %alloca8)
+
+  ret void
+}
+
+@lds_align8 = internal unnamed_addr addrspace(3) global [256 x i64] undef, align 8
+@lds_align4 = internal unnamed_addr addrspace(3) global [256 x i64] undef, align 4
+@lds_align1 = internal unnamed_addr addrspace(3) global [256 x i64] undef, align 1
+@lds_noalign = internal unnamed_addr addrspace(3) global [256 x i64] undef
+
+@extern_lds_align8 = unnamed_addr addrspace(3) global [256 x i64] undef, align 8
+@extern_lds_align4 = unnamed_addr addrspace(3) global [256 x i64] undef, align 4
+@extern_lds_align1 = unnamed_addr addrspace(3) global [256 x i64] undef, align 1
+@extern_lds_noalign = unnamed_addr addrspace(3) global [256 x i64] undef
+
+; CHECK-LABEL: @store_global_i64(
+; CHECK: store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align8, i64 0, i64 0), align 8
+; CHECK: store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align4, i64 0, i64 0), align 4
+; CHECK: store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align1, i64 0, i64 0), align 4
+; CHECK: store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_noalign, i64 0, i64 0), align 16
+
+; CHECK: store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align8, i64 0, i64 0), align 8
+; CHECK: store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align4, i64 0, i64 0), align 4
+; CHECK: store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align1, i64 0, i64 0), align 1
+; CHECK: store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_noalign, i64 0, i64 0), align 16
+
+define void @store_global_i64() {
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align8, i64 0, i64 0)
+  call void @use.p3i64(i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align8, i64 0, i64 0))
+
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align4, i64 0, i64 0), align 4
+  call void @use.p3i64(i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align4, i64 0, i64 0))
+
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align1, i64 0, i64 0), align 1
+  call void @use.p3i64(i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align1, i64 0, i64 0))
+
+  store
 i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_noalign, i64 0, i64 0)
+  call void @use.p3i64(i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_noalign, i64 0, i64 0))
+
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align8, i64 0, i64 0)
+  call void @use.p3i64(i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align8, i64 0, i64 0))
+
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align4, i64 0, i64 0), align 4
+  call void @use.p3i64(i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align4, i64 0, i64 0))
+
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align1, i64 0, i64 0), align 1
+  call void @use.p3i64(i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align1, i64 0, i64 0))
+
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_noalign, i64 0, i64 0)
+  call void @use.p3i64(i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_noalign, i64 0, i64 0))
+
+  ret void
+}
+
+; CHECK-LABEL: @load_global_i64(
+; CHECK: load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align8, i64 0, i64 0), align 8
+; CHECK: load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align4, i64 0, i64 0), align 4
+; CHECK: load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align1, i64 0, i64 0), align 4
+; CHECK: load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_noalign, i64 0, i64 0), align 16
+
+; CHECK: load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align8, i64 0, i64 0), align 8
+; CHECK: load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align4, i64 0, i64 0), align 4
+; CHECK: load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align1, i64 0, i64 0), align 1
+; CHECK: load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_noalign, i64 0, i64 0), align 16
+define void @load_global_i64() {
+  %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align8, i64 0, i64 0)
+  call void @use.i64(i64 %val0)
+
+  %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align4, i64 0, i64 0), align 4
+  call void @use.i64(i64 %val1)
+
+  %val2 = load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_align1, i64 0, i64 0), align 1
+  call void @use.i64(i64 %val2)
+
+  %val3 = load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @lds_noalign, i64 0, i64 0)
+  call void @use.i64(i64 %val3)
+
+  %val4 = load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align8, i64 0, i64 0)
+  call void @use.i64(i64 %val4)
+
+  %val5 = load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align4, i64 0, i64 0), align 4
+  call void @use.i64(i64 %val5)
+
+  %val6 = load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_align1, i64 0, i64 0), align 1
+  call void @use.i64(i64 %val6)
+
+  %val7 = load i64, i64 addrspace(3)* getelementptr inbounds ([256 x i64], [256 x i64] addrspace(3)* @extern_lds_noalign, i64 0, i64 0)
+  call void @use.i64(i64 %val7)
+
+  ret void
+}
+
+@const_array_i64 = addrspace(2) constant [12 x i64] [i64 2, i64 9, i64 4, i64 22, i64 2, i64 9, i64 4, i64 22, i64 2, i64 9, i64 4, i64 22]
+
+; Must use ABI alignment, may increase to 8
+; CHECK-LABEL: @memcpy_from_global_align_i64(
+; CHECK: %alloca = alloca [12 x i64], align 8
+; CHECK: call void @llvm.memcpy.p0i8.p2i8.i64(i8* %cast.alloca, i8 addrspace(2)* bitcast ([12 x i64] addrspace(2)* @const_array_i64 to i8 addrspace(2)*), i64 96, i32 8, i1 false)
+define void @memcpy_from_global_align_i64() {
+  %alloca = alloca [12 x i64]
+  %cast.alloca = bitcast [12 x i64]* %alloca to i8*
+  %cast.const = bitcast [12 x i64] addrspace(2)* @const_array_i64 to i8 addrspace(2)*
+  call void @llvm.memcpy.p0i8.p2i8.i64(i8* %cast.alloca, i8 addrspace(2)* %cast.const, i64 96, i32 1, i1 false)
+  %cast.alloca.1 = bitcast [12 x i64]* %alloca to i64*
+  call void @use.p0i64(i64* %cast.alloca.1)
+  ret void
+}
+
+declare i32 @memcmp(i8*, i8*, i32)
+
+; CHECK-LABEL: @test_simplify_memcmp_lowalign(
+; CHECK: %cmp = icmp eq i64 %x, %y
+; CHECK-NEXT: ret i1 %cmp
+define i1 @test_simplify_memcmp_lowalign(i64 %x, i64 %y) {
+  %x.addr = alloca i64, align 4
+  %y.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  store i64 %y, i64* %y.addr, align 4
+  %xptr = bitcast i64* %x.addr to i8*
+  %yptr = bitcast i64* %y.addr to i8*
+  %call = call i32 @memcmp(i8* %xptr, i8* %yptr, i32 8)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}