diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -403,12 +403,13 @@ template class BuilderBase; class SliceBuilder; + void rewriteVariablyIndexedLoad(Instruction &Root, LoadInst *LI); + Instruction &rewriteVariablyIndexedLoads(ArrayRef LIs); + friend class AllocaSlices::SliceBuilder; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Handle to alloca instruction to simplify method interfaces. AllocaInst &AI; -#endif /// The instruction responsible for this alloca not having a known set /// of slices. @@ -757,14 +758,18 @@ SmallDenseMap MemTransferSliceMap; SmallDenseMap PHIOrSelectSizes; + /// All `load`s with non-constant offsets. + SmallVectorImpl &VariablyIndexedLoads; + /// Set to de-duplicate dead instructions found in the use walk. SmallPtrSet VisitedDeadInsts; public: - SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS) + SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS, + SmallVectorImpl &VariablyIndexedLoads_) : PtrUseVisitor(DL), AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()), - AS(AS) {} + AS(AS), VariablyIndexedLoads(VariablyIndexedLoads_) {} private: void markAsDead(Instruction &I) { @@ -881,17 +886,33 @@ insertUse(I, Offset, Size, IsSplittable); } + void handleVariablyIndexedLoad(Type *Ty, LoadInst &LI, uint64_t Size, + bool IsVolatile) { + if (IsVolatile) + return PI.setAborted(&LI); + // Profitability reasoning: we expect that for the largest legal int type, + // we do have good support for variable-amount shifts. For the type 2x that + // width, the legalization will expand the shift into, at worst, 3 shifts + // plus 5 supporting ALU ops. We expect that such an expansion is still not + // worse than failing to promote the alloca. + // But for any bit width larger than that, this isn't worth it. 
+ uint64_t AllocaBitwidth = 8 * AllocSize; + if (unsigned MaxIntBitwidth = DL.getLargestLegalIntTypeSizeInBits(); + AllocaBitwidth > 2 * MaxIntBitwidth) + return PI.setAborted(&LI); + VariablyIndexedLoads.emplace_back(&LI); + } + + void visitLoadInst(LoadInst &LI) { assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && "All simple FCA loads should have been pre-split"); - if (!IsOffsetKnown) - return PI.setAborted(&LI); - if (isa(LI.getType())) return PI.setAborted(&LI); uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize(); + if (!IsOffsetKnown) + return handleVariablyIndexedLoad(LI.getType(), LI, Size, LI.isVolatile()); return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile()); } @@ -1157,12 +1178,9 @@ }; AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) - : -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - AI(AI), -#endif - PointerEscapingInstr(nullptr) { - SliceBuilder PB(DL, AI, *this); + : AI(AI), PointerEscapingInstr(nullptr) { + SmallVector VariablyIndexedLoads; + SliceBuilder PB(DL, AI, *this, VariablyIndexedLoads); SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { // FIXME: We should sink the escape vs. abort info into the caller nicely, @@ -1173,6 +1191,14 @@ return; } + // Ok, if we are still here, then we can deal with everything we encountered. + + if (!VariablyIndexedLoads.empty()) { + Instruction &Root = rewriteVariablyIndexedLoads(VariablyIndexedLoads); + SliceBuilder::PtrInfo PtrI = PB.visitPtr(Root); + assert(!PtrI.isEscaped() && !PtrI.isAborted()); + } + llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); }); // Sort the uses. This arranges for the offsets to be in ascending order, @@ -1180,6 +1206,129 @@ llvm::stable_sort(Slices); } +// Given the load \p LI, how do we come up with its address? +// Recurse until we reach the base `alloca`, remembering `Instruction` sequence. 
+static SmallVector getAddressCalculationStack(LoadInst *LI) { + SmallVector Stack; + Stack.emplace_back(LI->getPointerOperand()); + while (true) { + auto *I = dyn_cast(Stack.back()); + assert(I && I->getType()->isPointerTy() && "Not a ptr-to-ptr instruction"); + switch (I->getOpcode()) { + case Instruction::Alloca: + // Done recursing. + return Stack; + case Instruction::GetElementPtr: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + Stack.emplace_back(I->getOperand(0)); + break; + default: + // We don't allow `select`s/`PHI`s of variably-offset addresses, + // so we should not get here. + llvm_unreachable("Unexpected address-calculating instruction."); + } + } + return Stack; +} + +// Given the load \p LI address, produce an expression equivalent to the +// ptrtoint(address into alloca) - ptrtoint(alloca) +// but without referencing the alloca itself. +static Value *getVariableByteOffsetIntoAlloca(LoadInst *LI) { + SmallVector Stack = getAddressCalculationStack(LI); + auto *OrigRoot = cast(Stack.pop_back_val()); + Constant *NullRoot = ConstantPointerNull::get(OrigRoot->getType()); + + // Replicate the address-computation Instruction stack, + // but start off of `null` pointer. + Value *NewPtr = NullRoot; + while (!Stack.empty()) { + auto *OrigInstr = cast(Stack.pop_back_val()); + auto *NewInstr = OrigInstr->clone(); + NewInstr->setOperand(0, NewPtr); + NewInstr->insertAfter(OrigInstr); + if (auto *NewGEPI = dyn_cast(NewInstr)) + NewGEPI->setIsInBounds(false); + else { + // For non-GEP's, also "rebase" the root, so they maintain same type. + NullRoot = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + NullRoot, NewInstr->getType()); + } + NewPtr = NewInstr; + } + + // Finally, compute the byte distance between the "dummy" address and `null`. 
+ IRBuilderTy Builder(LI); + Type *ByteTy = IntegerType::getInt8Ty(Builder.getContext()); + Type *BytePtrTy = ByteTy->getPointerTo(LI->getPointerAddressSpace()); + NewPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(NewPtr, BytePtrTy); + NullRoot = + ConstantExpr::getPointerBitCastOrAddrSpaceCast(NullRoot, BytePtrTy); + return Builder.CreatePtrDiff(ByteTy, NewPtr, NullRoot); +} + +// For each variably-indexed load, perform a wide load of the whole alloca +// (and now *that* we *can* promote), compute the byte offset into alloca +// from which we've originally loaded, and then use bit math to extract +// the equivalent bit sequence from the wide load. +void AllocaSlices::rewriteVariablyIndexedLoad(Instruction &Root, LoadInst *LI) { + const DataLayout &DL = LI->getModule()->getDataLayout(); + IRBuilderTy Builder(LI); + + Type *LoadTy = LI->getType(); + assert(!isa(LoadTy)); + + uint64_t AllocByteSize = + DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize(); + uint64_t AllocBitwidth = 8 * AllocByteSize; + + uint64_t LoadBitwidth = 8 * DL.getTypeStoreSize(LI->getType()).getFixedSize(); + + Type *LoadBitTy = IntegerType::get(Builder.getContext(), LoadBitwidth); + Type *AllocBitTy = IntegerType::get(Builder.getContext(), AllocBitwidth); + Type *AllocByteTy = FixedVectorType::get( + IntegerType::getInt8Ty(Builder.getContext()), AllocByteSize); + + Value *V = Builder.CreateAlignedLoad(AllocByteTy, &Root, AI.getAlign(), + AI.getName() + ".val"); + V = Builder.CreateFreeze(V, V->getName() + ".frozen"); + V = Builder.CreateBitCast(V, AllocBitTy, V->getName() + ".bits"); + + Value *VariableByteOffset = getVariableByteOffsetIntoAlloca(LI); + Value *Offset = Builder.CreateMul( + VariableByteOffset, ConstantInt::get(VariableByteOffset->getType(), 8), + VariableByteOffset->getName() + ".numbits"); + Offset = Builder.CreateZExtOrTrunc(Offset, AllocBitTy, + Offset->getName() + ".wide"); + + if (DL.isLittleEndian()) + V = Builder.CreateLShr(V, Offset, V->getName() + 
".positioned"); // inexact. + else { + V = Builder.CreateShl(V, Offset, V->getName() + ".positioned"); // inexact. + V = Builder.CreateLShr( + V, ConstantInt::get(V->getType(), AllocBitwidth - LoadBitwidth), + V->getName() + ".part"); + } + V = Builder.CreateTrunc(V, LoadBitTy, V->getName() + ".extracted"); + V = Builder.CreateBitCast(V, LoadTy); + LI->replaceAllUsesWith(V); + DeadUsers.emplace_back(LI); +} + +Instruction & +AllocaSlices::rewriteVariablyIndexedLoads(ArrayRef LIs) { + // Create empty GEP to base all our newly-inserted instructions off of, + // so we can feed it back into `SliceBuilder` to record our instructions. + Instruction &Root = *GetElementPtrInst::CreateInBounds( + IntegerType::getInt8Ty(AI.getContext()), &AI, {}); + Root.insertAfter(&AI); + // And just rewrite each `load` we previously recorded. + for (LoadInst *LI : LIs) + rewriteVariablyIndexedLoad(Root, LI); + return Root; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void AllocaSlices::print(raw_ostream &OS, const_iterator I, diff --git a/llvm/test/Transforms/SROA/widen-load-of-small-alloca.ll b/llvm/test/Transforms/SROA/widen-load-of-small-alloca.ll --- a/llvm/test/Transforms/SROA/widen-load-of-small-alloca.ll +++ b/llvm/test/Transforms/SROA/widen-load-of-small-alloca.ll @@ -9,14 +9,67 @@ ; RUN: opt -passes='sroa' -data-layout="E-n8:16:32" -S %s | FileCheck %s --check-prefixes=CHECK-ALL,CHECK-SCALAR,CHECK-SCALAR-32,CHECK-BE-32 define void @load-1byte-chunk-of-1byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-1byte-chunk-of-1byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [1 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <1 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <1 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <1 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; 
CHECK-ALL-NEXT: call void @use.v1i8(<1 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-1byte-chunk-of-1byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <1 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <1 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <1 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i8 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i8 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i8 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <1 x i8> +; CHECK-LE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-1byte-chunk-of-1byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <1 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <1 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <1 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i8 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i8 +; 
CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i8 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <1 x i8> +; CHECK-LE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-1byte-chunk-of-1byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <1 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <1 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <1 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i8 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i8 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i8 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 0 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to <1 x i8> +; CHECK-BE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-1byte-chunk-of-1byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <1 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <1 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <1 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i8 +; 
CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i8 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i8 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 0 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to <1 x i8> +; CHECK-BE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [1 x i8], align 64 %init = load <1 x i8>, ptr %src, align 1 @@ -28,14 +81,71 @@ } define void @load-1byte-chunk-of-2byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-1byte-chunk-of-2byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <2 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <1 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v1i8(<1 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-1byte-chunk-of-2byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; 
CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i16 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <1 x i8> +; CHECK-LE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-1byte-chunk-of-2byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i16 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i8 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <1 x i8> +; CHECK-LE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-1byte-chunk-of-2byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i16 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 8 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <1 x i8> +; CHECK-BE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-1byte-chunk-of-2byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; 
CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i16 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 8 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <1 x i8> +; CHECK-BE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [2 x i8], align 64 %init = load <2 x i8>, ptr %src, align 1 @@ -47,14 +157,67 @@ } define void @load-2byte-chunk-of-2byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-2byte-chunk-of-2byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <2 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v2i8(<2 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-2byte-chunk-of-2byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: 
[[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i16 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <2 x i8> +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-2byte-chunk-of-2byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i16 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <2 x i8> +; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret 
void +; +; CHECK-BE-64-LABEL: @load-2byte-chunk-of-2byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i16 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 0 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to <2 x i8> +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-2byte-chunk-of-2byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: 
[[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i16 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 0 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to <2 x i8> +; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [2 x i8], align 64 %init = load <2 x i8>, ptr %src, align 1 @@ -66,14 +229,71 @@ } define void @load-1byte-chunk-of-4byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-1byte-chunk-of-4byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <1 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v1i8(<1 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-1byte-chunk-of-4byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; 
CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <1 x i8> +; CHECK-LE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-1byte-chunk-of-4byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <1 x i8> +; CHECK-LE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-1byte-chunk-of-4byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = 
getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 24 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <1 x i8> +; CHECK-BE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-1byte-chunk-of-4byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: 
[[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 24 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <1 x i8> +; CHECK-BE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [4 x i8], align 64 %init = load <4 x i8>, ptr %src, align 1 @@ -85,14 +305,71 @@ } define void @load-2byte-chunk-of-4byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-2byte-chunk-of-4byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v2i8(<2 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-2byte-chunk-of-4byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = 
sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-2byte-chunk-of-4byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; 
CHECK-BE-64-LABEL: @load-2byte-chunk-of-4byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 16 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-2byte-chunk-of-4byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 
[[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 16 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [4 x i8], align 64 %init = load <4 x i8>, ptr %src, align 1 @@ -104,14 +381,67 @@ } define void @load-4byte-chunk-of-4byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-4byte-chunk-of-4byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v4i8(<4 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-4byte-chunk-of-4byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; 
CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <4 x i8> +; CHECK-LE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-4byte-chunk-of-4byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <4 x i8> +; CHECK-LE-32-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-4byte-chunk-of-4byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = 
getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 0 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to <4 x i8> +; CHECK-BE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-4byte-chunk-of-4byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: [[DOTNUMBITS_WIDE:%.*]] = trunc i64 [[DOTNUMBITS]] to i32 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], 
[[DOTNUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 0 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to <4 x i8> +; CHECK-BE-32-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [4 x i8], align 64 %init = load <4 x i8>, ptr %src, align 1 @@ -123,14 +453,67 @@ } define void @load-1byte-chunk-of-8byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-1byte-chunk-of-8byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <1 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v1i8(<1 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-1byte-chunk-of-8byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-64-NEXT: 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <1 x i8> +; CHECK-LE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-1byte-chunk-of-8byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <1 x i8> +; CHECK-LE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-1byte-chunk-of-8byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; 
CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 56 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <1 x i8> +; CHECK-BE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-1byte-chunk-of-8byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 56 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; 
CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <1 x i8> +; CHECK-BE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [8 x i8], align 64 %init = load <8 x i8>, ptr %src, align 1 @@ -142,14 +525,67 @@ } define void @load-2byte-chunk-of-8byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-2byte-chunk-of-8byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v2i8(<2 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-2byte-chunk-of-8byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i16 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-2byte-chunk-of-8byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-2byte-chunk-of-8byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; 
CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 48 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-2byte-chunk-of-8byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 48 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; 
CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [8 x i8], align 64 %init = load <8 x i8>, ptr %src, align 1 @@ -161,14 +597,67 @@ } define void @load-4byte-chunk-of-8byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-4byte-chunk-of-8byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v4i8(<4 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-4byte-chunk-of-8byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i32 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <4 x i8> +; CHECK-LE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: 
@load-4byte-chunk-of-8byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i32 +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <4 x i8> +; CHECK-LE-32-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-4byte-chunk-of-8byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], 
[[DOTNUMBITS]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 32 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i32 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <4 x i8> +; CHECK-BE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-4byte-chunk-of-8byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 32 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i32 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <4 x i8> +; CHECK-BE-32-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [8 x i8], align 64 %init = load <8 x i8>, ptr %src, align 1 @@ -180,14 +669,63 @@ } define void 
@load-8byte-chunk-of-8byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-8byte-chunk-of-8byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v8i8(<8 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-8byte-chunk-of-8byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <8 x i8> +; CHECK-LE-64-NEXT: call void @use.v8i8(<8 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-8byte-chunk-of-8byte-alloca( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-32-NEXT: 
[[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <8 x i8> +; CHECK-LE-32-NEXT: call void @use.v8i8(<8 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-8byte-chunk-of-8byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 0 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to <8 x i8> +; CHECK-BE-64-NEXT: call void @use.v8i8(<8 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-8byte-chunk-of-8byte-alloca( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, 
ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 0 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to <8 x i8> +; CHECK-BE-32-NEXT: call void @use.v8i8(<8 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [8 x i8], align 64 %init = load <8 x i8>, ptr %src, align 1 @@ -199,14 +737,47 @@ } define void @load-1byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-1byte-chunk-of-16byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <1 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v1i8(<1 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-1byte-chunk-of-16byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = 
getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <1 x i8> +; CHECK-LE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-1byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <1 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: call void @use.v1i8(<1 x i8> [[CHUNK]]) +; CHECK-SCALAR-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-1byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: 
[[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 120 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <1 x i8> +; CHECK-BE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void ; %intermediate = alloca [16 x i8], align 64 %init = load <16 x i8>, ptr %src, align 1 @@ -218,14 +789,47 @@ } define void @load-2byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-2byte-chunk-of-16byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v2i8(<2 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-2byte-chunk-of-16byte-alloca( 
+; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-2byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: call void @use.v2i8(<2 x i8> [[CHUNK]]) +; CHECK-SCALAR-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-2byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; 
CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 112 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void ; %intermediate = alloca [16 x i8], align 64 %init = load <16 x i8>, ptr %src, align 1 @@ -237,14 +841,47 @@ } define void @load-4byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-4byte-chunk-of-16byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v4i8(<4 
x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-4byte-chunk-of-16byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i32 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <4 x i8> +; CHECK-LE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-4byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: call void @use.v4i8(<4 x i8> [[CHUNK]]) +; CHECK-SCALAR-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: 
@load-4byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 96 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i32 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <4 x i8> +; CHECK-BE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void ; %intermediate = alloca [16 x i8], align 64 %init = load <16 x i8>, ptr %src, align 1 @@ -256,14 +893,47 @@ } define void @load-8byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-8byte-chunk-of-16byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; 
CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v8i8(<8 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-8byte-chunk-of-16byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i64 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <8 x i8> +; CHECK-LE-64-NEXT: call void @use.v8i8(<8 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-8byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: 
call void @use.v8i8(<8 x i8> [[CHUNK]]) +; CHECK-SCALAR-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-8byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i64 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <8 x i8> +; CHECK-BE-64-NEXT: call void @use.v8i8(<8 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void ; %intermediate = alloca [16 x i8], align 64 %init = load <16 x i8>, ptr %src, align 1 @@ -275,14 +945,45 @@ } define void @load-16byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-16byte-chunk-of-16byte-alloca( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: 
[[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v16i8(<16 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-16byte-chunk-of-16byte-alloca( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <16 x i8> +; CHECK-LE-64-NEXT: call void @use.v16i8(<16 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-16byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: call void @use.v16i8(<16 x i8> 
[[CHUNK]]) +; CHECK-SCALAR-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-16byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[DOTNUMBITS_WIDE:%.*]] = zext i64 [[DOTNUMBITS]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 0 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to <16 x i8> +; CHECK-BE-64-NEXT: call void @use.v16i8(<16 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void ; %intermediate = alloca [16 x i8], align 64 %init = load <16 x i8>, ptr %src, align 1 @@ -410,14 +1111,67 @@ ;; Special test define void @load-2byte-chunk-of-8byte-alloca-with-2byte-step(ptr %src, i64 %byteOff) { -; CHECK-ALL-LABEL: @load-2byte-chunk-of-8byte-alloca-with-2byte-step( -; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 -; CHECK-ALL-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 -; CHECK-ALL-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 -; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i16, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] -; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <2 x i8>, ptr 
[[INTERMEDIATE_OFF_ADDR]], align 1 -; CHECK-ALL-NEXT: call void @use.v2i8(<2 x i8> [[CHUNK]]) -; CHECK-ALL-NEXT: ret void +; CHECK-LE-64-LABEL: @load-2byte-chunk-of-8byte-alloca-with-2byte-step( +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-LE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-64-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-2byte-chunk-of-8byte-alloca-with-2byte-step( +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-LE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-LE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; 
CHECK-LE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-32-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-2byte-chunk-of-8byte-alloca-with-2byte-step( +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-64-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-64-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-64-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-64-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 48 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-64-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-2byte-chunk-of-8byte-alloca-with-2byte-step( +; CHECK-BE-32-NEXT: [[INIT:%.*]] = 
load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr null, i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INIT]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-32-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-BE-32-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 0 +; CHECK-BE-32-NEXT: [[TMP4:%.*]] = sdiv exact i64 [[TMP3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +; CHECK-BE-32-NEXT: [[DOTNUMBITS:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[DOTNUMBITS]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 48 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-32-NEXT: [[TMP5:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP5]]) +; CHECK-BE-32-NEXT: ret void ; %intermediate = alloca [8 x i8], align 64 %init = load <8 x i8>, ptr %src, align 1 @@ -496,10 +1250,5 @@ declare void @use.v16i8(<16 x i8>) declare void @use.v32i8(<32 x i8>) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-BE-32: {{.*}} -; CHECK-BE-64: {{.*}} -; CHECK-LE-32: {{.*}} -; CHECK-LE-64: {{.*}} ; CHECK-SCALAR: {{.*}} -; CHECK-SCALAR-32: {{.*}} ; CHECK-SCALAR-64: {{.*}}