Index: llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
===================================================================
--- llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -14,17 +14,9 @@
 
 using namespace llvm;
 
-static unsigned getLoopOperandSizeInBytes(Type *Type) {
-  if (VectorType *VTy = dyn_cast<VectorType>(Type)) {
-    return VTy->getBitWidth() / 8;
-  }
-
-  return Type->getPrimitiveSizeInBits() / 8;
-}
-
 void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
                                      Value *DstAddr, ConstantInt *CopyLen,
-                                     unsigned SrcAlign, unsigned DestAlign,
+                                     unsigned SrcAlign, unsigned DstAlign,
                                      bool SrcIsVolatile, bool DstIsVolatile,
                                      const TargetTransformInfo &TTI) {
   // No need to expand zero length copies.
@@ -35,15 +27,16 @@
   BasicBlock *PostLoopBB = nullptr;
   Function *ParentFunc = PreLoopBB->getParent();
   LLVMContext &Ctx = PreLoopBB->getContext();
+  const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
 
   unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
   unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
 
   Type *TypeOfCopyLen = CopyLen->getType();
-  Type *LoopOpType =
-      TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DestAlign);
+  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
+                                                   SrcAlign, DstAlign);
 
-  unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
   uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
 
   if (LoopEndCount != 0) {
@@ -66,16 +59,21 @@
       DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
     }
 
+    unsigned PartSize = DL.getTypeStoreSize(LoopOpType);
+    Align PartDstAlign(MinAlign(DstAlign, PartSize));
+    Align PartSrcAlign(MinAlign(SrcAlign, PartSize));
+
     IRBuilder<> LoopBuilder(LoopBB);
     PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
     LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
 
     // Loop Body
     Value *SrcGEP =
         LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
-    Value *Load = LoopBuilder.CreateLoad(LoopOpType, SrcGEP, SrcIsVolatile);
+    Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
+                                                PartSrcAlign, SrcIsVolatile);
     Value *DstGEP =
         LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
-    LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+    LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
     Value *NewIndex =
         LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
@@ -93,18 +91,16 @@
     IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
                                     : InsertBefore);
 
-    // Update the alignment based on the copy size used in the loop body.
-    SrcAlign = std::min(SrcAlign, LoopOpSize);
-    DestAlign = std::min(DestAlign, LoopOpSize);
-
     SmallVector<Type *, 5> RemainingOps;
     TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
-                                          SrcAS, DstAS,
-                                          SrcAlign, DestAlign);
+                                          SrcAS, DstAS, SrcAlign, DstAlign);
 
     for (auto OpTy : RemainingOps) {
+      Align PartSrcAlign(MinAlign(SrcAlign, BytesCopied));
+      Align PartDstAlign(MinAlign(DstAlign, BytesCopied));
+
       // Calaculate the new index
-      unsigned OperandSize = getLoopOperandSizeInBytes(OpTy);
+      unsigned OperandSize = DL.getTypeStoreSize(OpTy);
       uint64_t GepIndex = BytesCopied / OperandSize;
       assert(GepIndex * OperandSize == BytesCopied &&
              "Division should have no Remainder!");
@@ -115,7 +111,8 @@
                              : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
       Value *SrcGEP = RBuilder.CreateInBoundsGEP(
           OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
-      Value *Load = RBuilder.CreateLoad(OpTy, SrcGEP, SrcIsVolatile);
+      Value *Load =
+          RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
 
       // Cast destination to operand type and store.
       PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
@@ -124,7 +121,7 @@
                              : RBuilder.CreateBitCast(DstAddr, DstPtrType);
       Value *DstGEP = RBuilder.CreateInBoundsGEP(
           OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
-      RBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+      RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
 
       BytesCopied += OperandSize;
     }
@@ -136,7 +133,7 @@
 void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
                                        Value *SrcAddr, Value *DstAddr,
                                        Value *CopyLen, unsigned SrcAlign,
-                                       unsigned DestAlign, bool SrcIsVolatile,
+                                       unsigned DstAlign, bool SrcIsVolatile,
                                        bool DstIsVolatile,
                                        const TargetTransformInfo &TTI) {
   BasicBlock *PreLoopBB = InsertBefore->getParent();
@@ -144,13 +141,14 @@
       PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
 
   Function *ParentFunc = PreLoopBB->getParent();
+  const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
   LLVMContext &Ctx = PreLoopBB->getContext();
 
   unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
   unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
 
-  Type *LoopOpType =
-      TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DestAlign);
-  unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
+                                                   SrcAlign, DstAlign);
+  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
 
   IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
 
@@ -178,13 +176,17 @@
       BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
   IRBuilder<> LoopBuilder(LoopBB);
 
+  Align PartSrcAlign(MinAlign(SrcAlign, LoopOpSize));
+  Align PartDstAlign(MinAlign(DstAlign, LoopOpSize));
+
   PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
   LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
 
   Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
-  Value *Load = LoopBuilder.CreateLoad(LoopOpType, SrcGEP, SrcIsVolatile);
+  Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, PartSrcAlign,
+                                              SrcIsVolatile);
   Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
-  LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+  LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
 
   Value *NewIndex =
       LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
@@ -235,10 +237,11 @@
     Value *FullOffset =
         ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
     Value *SrcGEP =
         ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
-    Value *Load = ResBuilder.CreateLoad(Int8Type, SrcGEP, SrcIsVolatile);
+    Value *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP, PartSrcAlign,
+                                               SrcIsVolatile);
     Value *DstGEP =
         ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
-    ResBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+    ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
 
     Value *ResNewIndex =
         ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
@@ -285,13 +288,14 @@
 //   }
 //   return dst;
 // }
-static void createMemMoveLoop(Instruction *InsertBefore,
-                              Value *SrcAddr, Value *DstAddr, Value *CopyLen,
-                              unsigned SrcAlign, unsigned DestAlign,
-                              bool SrcIsVolatile, bool DstIsVolatile) {
+static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
+                              Value *DstAddr, Value *CopyLen, unsigned SrcAlign,
+                              unsigned DstAlign, bool SrcIsVolatile,
+                              bool DstIsVolatile) {
   Type *TypeOfCopyLen = CopyLen->getType();
   BasicBlock *OrigBB = InsertBefore->getParent();
   Function *F = OrigBB->getParent();
+  const DataLayout &DL = F->getParent()->getDataLayout();
 
   Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType();
 
@@ -319,6 +323,10 @@
   BasicBlock *ExitBB = InsertBefore->getParent();
   ExitBB->setName("memmove_done");
 
+  unsigned PartSize = DL.getTypeStoreSize(EltTy);
+  Align PartSrcAlign(MinAlign(SrcAlign, PartSize));
+  Align PartDstAlign(MinAlign(DstAlign, PartSize));
+
   // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
   // between both backwards and forward copy clauses.
   ICmpInst *CompareN =
@@ -332,11 +340,12 @@
   PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
   Value *IndexPtr = LoopBuilder.CreateSub(
       LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
-  Value *Element = LoopBuilder.CreateLoad(
+  Value *Element = LoopBuilder.CreateAlignedLoad(
       EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
-      "element");
-  LoopBuilder.CreateStore(
-      Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr));
+      PartSrcAlign, "element");
+  LoopBuilder.CreateAlignedStore(
+      Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
+      PartDstAlign);
   LoopBuilder.CreateCondBr(
       LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
       ExitBB, LoopBB);
@@ -350,11 +359,11 @@
       BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
   IRBuilder<> FwdLoopBuilder(FwdLoopBB);
   PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
-  Value *FwdElement = FwdLoopBuilder.CreateLoad(
-      EltTy, FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi),
-      "element");
-  FwdLoopBuilder.CreateStore(
-      FwdElement, FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi));
+  Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
+  Value *FwdElement =
+      FwdLoopBuilder.CreateAlignedLoad(EltTy, SrcGEP, PartSrcAlign, "element");
+  Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
+  FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign);
   Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
       FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
   FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
@@ -366,12 +375,13 @@
   ElseTerm->eraseFromParent();
 }
 
-static void createMemSetLoop(Instruction *InsertBefore,
-                             Value *DstAddr, Value *CopyLen, Value *SetValue,
-                             unsigned Align, bool IsVolatile) {
+static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
+                             Value *CopyLen, Value *SetValue, unsigned DstAlign,
+                             bool IsVolatile) {
   Type *TypeOfCopyLen = CopyLen->getType();
   BasicBlock *OrigBB = InsertBefore->getParent();
   Function *F = OrigBB->getParent();
+  const DataLayout &DL = F->getParent()->getDataLayout();
 
   BasicBlock *NewBB = OrigBB->splitBasicBlock(InsertBefore, "split");
   BasicBlock *LoopBB
@@ -389,14 +399,17 @@
       LoopBB);
   OrigBB->getTerminator()->eraseFromParent();
 
+  unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
+  Align PartAlign(MinAlign(DstAlign, PartSize));
+
   IRBuilder<> LoopBuilder(LoopBB);
   PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
   LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
 
-  LoopBuilder.CreateStore(
+  LoopBuilder.CreateAlignedStore(
       SetValue,
       LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
-      IsVolatile);
+      PartAlign, IsVolatile);
 
   Value *NewIndex =
       LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
Index: llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -24,9 +24,9 @@
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load i8, i8 addrspace(1)* [[TMP1]]
+; OPT-NEXT:    [[TMP2:%.*]] = load i8, i8 addrspace(1)* [[TMP1]], align 1
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* [[TMP3]]
+; OPT-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* [[TMP3]], align 1
 ; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1025
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -57,9 +57,9 @@
 ; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ]
 ; OPT-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
 ; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR]]
-; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR]]
-; OPT-NEXT:    store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]]
+; OPT-NEXT:    store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]], align 1
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
 ; OPT:       copy_forward:
@@ -67,9 +67,9 @@
 ; OPT:       copy_forward_loop:
 ; OPT-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
 ; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR1]]
-; OPT-NEXT:    [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]]
+; OPT-NEXT:    [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]], align 1
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR1]]
-; OPT-NEXT:    store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]]
+; OPT-NEXT:    store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]], align 1
 ; OPT-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
 ; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025
 ; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
@@ -95,7 +95,7 @@
 ; OPT:       loadstoreloop:
 ; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
 ; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
-; OPT-NEXT:    store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]]
+; OPT-NEXT:    store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]], align 1
 ; OPT-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
@@ -113,9 +113,9 @@
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]]
+; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
 ; OPT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]]
+; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
 ; OPT-NEXT:    [[TMP5]] = add i64 [[LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]]
 ; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]
@@ -133,9 +133,9 @@
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]]
+; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
 ; OPT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]]
+; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
 ; OPT-NEXT:    [[TMP5]] = add i64 [[LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]]
 ; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]
@@ -153,9 +153,9 @@
 ; OPT:       loop-memcpy-expansion2:
 ; OPT-NEXT:    [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ]
 ; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX3]]
-; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]]
+; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
 ; OPT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST0:%.*]], i64 [[LOOP_INDEX3]]
-; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]]
+; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
 ; OPT-NEXT:    [[TMP5]] = add i64 [[LOOP_INDEX3]], 1
 ; OPT-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]]
 ; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION2]], label [[POST_LOOP_MEMCPY_EXPANSION1]]
@@ -165,9 +165,9 @@
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP11:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP9:%.*]] = load i8, i8 addrspace(1)* [[TMP8]]
+; OPT-NEXT:    [[TMP9:%.*]] = load i8, i8 addrspace(1)* [[TMP8]], align 1
 ; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST1:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store i8 [[TMP9]], i8 addrspace(1)* [[TMP10]]
+; OPT-NEXT:    store i8 [[TMP9]], i8 addrspace(1)* [[TMP10]], align 1
 ; OPT-NEXT:    [[TMP11]] = add i64 [[LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP12:%.*]] = icmp ult i64 [[TMP11]], [[M]]
 ; OPT-NEXT:    br i1 [[TMP12]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]
@@ -186,9 +186,9 @@
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(3)* [[TMP2]]
+; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(3)* [[TMP2]], align 1
 ; OPT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]]
+; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
 ; OPT-NEXT:    [[TMP5]] = add i32 [[LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[N]]
 ; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]
@@ -207,9 +207,9 @@
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]]
+; OPT-NEXT:    [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
 ; OPT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST0:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]]
+; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
 ; OPT-NEXT:    [[TMP5]] = add i64 [[LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]]
 ; OPT-NEXT:    br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]
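
Note (reviewer aside, not part of the patch): every per-part alignment above is derived with MinAlign(BaseAlign, PartSizeOrOffset), i.e. the largest power of two that divides both operands. Below is a minimal standalone C++ sketch of that arithmetic; the local MinAlign is a re-implementation written to mirror llvm::MinAlign from llvm/Support/MathExtras.h, so the example builds without LLVM headers.

// Standalone sketch: why MinAlign(base alignment, part size / bytes copied)
// is the alignment that can safely be put on the partial loads and stores.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Largest power of two dividing both A and B (lowest set bit of A | B).
static uint64_t MinAlign(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

int main() {
  // Main loop: a 16-byte aligned source copied with a 4-byte loop operand
  // still gives every load in the loop at least 4-byte alignment.
  const uint64_t SrcAlign = 16, PartSize = 4;
  assert(MinAlign(SrcAlign, PartSize) == 4);

  // Residual copies start at offset BytesCopied, so the usable alignment is
  // capped by the largest power of two dividing both the base alignment and
  // that offset (here: 8-byte aligned base, 12 bytes already copied -> 4).
  const uint64_t DstAlign = 8, BytesCopied = 12;
  assert(MinAlign(DstAlign, BytesCopied) == 4);

  std::printf("part align: %llu, residual align: %llu\n",
              (unsigned long long)MinAlign(SrcAlign, PartSize),
              (unsigned long long)MinAlign(DstAlign, BytesCopied));
  return 0;
}

This is also why the lowered i8 loops in the AMDGPU test above are only annotated with "align 1": with an i8 loop operand the part size caps the provable alignment at one byte regardless of the original pointer alignment.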