diff --git a/llvm/include/llvm/Transforms/Scalar/SROA.h b/llvm/include/llvm/Transforms/Scalar/SROA.h
--- a/llvm/include/llvm/Transforms/Scalar/SROA.h
+++ b/llvm/include/llvm/Transforms/Scalar/SROA.h
@@ -131,6 +131,10 @@
   void clobberUse(Use &U);
   bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
   bool promoteAllocas(Function &F);
+
+  bool scalarConstantStoresEquivalentToWholeUsage(
+      SmallVector<BasicBlock *> &BBPtrToFurtherAnalyze, sroa::AllocaSlices &AS,
+      uint64_t AllocaSize);
 };
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -116,6 +116,107 @@
     cl::Hidden);
 namespace {
+// The access patterns for an allocated type.
+// Each enumerator is used as a bit position to set / test in a bit map.
+//
+// Usage:
+// The bit positions are supposed to be set by instruction walkers, and read
+// by alloca-instruction rewriters as necessary. For instance, function
+// `shouldSplitAllocaPerAccessBitMap` reads the recorded positions to determine
+// whether to proceed with scalarization or not.
+//
+// Examples inline.
+//
+// Context:
+//   struct Foo { int x; int y; };
+//   Foo foo_generator() {
+//     return {1, rand()};
+//   }
+//
+//   int arr[5];
+enum class AllocaAccessKind {
+  // The allocated type is stored as a whole, and the value might be constant
+  // or dynamic.
+  //
+  //   Foo f = foo_generator();
+  //
+  //   memset(arr, 0, sizeof(int) * 5);
+  StoreAsAWhole = 0,
+  // The allocated type is written into with a constant through a scalar type.
+  //
+  //   Foo f;
+  //   f.x = 1;
+  //
+  //   arr[0] = 2;
+  ConstantStoreByScalar = 1,
+
+  // The structure type is stored via sub-fields, and the value is not
+  // constant.
+  //
+  // Scalars in `f` are written with non-constant values.
+  //   Foo f;
+  //   f.x = rand();
+  //
+  //   arr[2] = rand();
+  NonConstantStoreByScalar = 2,
+  // The allocated type is read as a whole.
+  //
+  // f is read as a whole.
+  //   Foo f = {2, 3};
+  //   Foo another = f;
+  //
+  // arr is read as a whole.
+  //   int b[5];
+  //   memcpy(b, arr, sizeof(arr));
+  ReadAsAWhole = 3,
+  // The structure type is read as a scalar.
+  //
+  // Scalars in `f` are read.
+  //   Foo f;
+  //   int tmp = f.y;
+
+  //   int element = arr[1];
+  ReadByScalar = 4,
+
+  // The usage of the alloca might be partial or full, but its instruction type
+  // is not covered by the access pattern analysis in
+  // AllocaSlices::SliceBuilder.
+  //
+  // Examples include:
+  //   - memcpy
+  //   - memmove
+  //   - volatile operations
+  OtherUsage = 5,
+
+  // The number of different access types, used to define the length of the
+  // bit map. When more types are added above, AccessKindCount should be
+  // adjusted accordingly.
+  AccessKindCount = 6,
+};
+
+// A helper function to cast an enum value to an integer.
+inline uint16_t accessTypeToBitMapIndex(AllocaAccessKind ap) {
+  uint64_t ret = static_cast<uint64_t>(ap);
+  assert(ret < static_cast<uint64_t>(AllocaAccessKind::AccessKindCount));
+  return ret;
+}
+
+inline uint16_t accessTypeBitMapLength() {
+  return static_cast<uint16_t>(AllocaAccessKind::AccessKindCount);
+}
+
+// Returns true if the access bit map indicates the alloca usage should be
+// split; returns false otherwise.
+bool shouldSplitAllocaPerAccessBitMap(const APInt allocAccessBitMap) {
+  return // If there are scalar reads or non-constant stores, return true.
+      (allocAccessBitMap[accessTypeToBitMapIndex(
+           AllocaAccessKind::ReadByScalar)] ||
+       allocAccessBitMap[accessTypeToBitMapIndex(
+           AllocaAccessKind::NonConstantStoreByScalar)] ||
+       // If there are uncovered access patterns, return true conservatively.
+       allocAccessBitMap[accessTypeToBitMapIndex(
+           AllocaAccessKind::OtherUsage)]);
+}
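Reviewer note: here is a minimal, self-contained sketch (not part of the patch) of how a per-basic-block access bit map is intended to be built and queried under the scheme above. Only `llvm::APInt` is assumed to be available; the integer literals and variable names are illustrative and simply mirror the `AllocaAccessKind` bit positions.

```cpp
// Sketch only: emulate what SliceBuilder would record for a block containing
// one whole-alloca store and one constant scalar store, then ask whether the
// alloca would have to be split.
#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  // One bit per AllocaAccessKind value; AccessKindCount == 6.
  llvm::APInt BBAccess(/*numBits=*/6, /*val=*/0);
  BBAccess.setBit(/*StoreAsAWhole=*/0);
  BBAccess.setBit(/*ConstantStoreByScalar=*/1);

  // Mirrors shouldSplitAllocaPerAccessBitMap: split only on scalar reads,
  // non-constant scalar stores, or uncovered ("other") usages.
  bool ShouldSplit = BBAccess[/*ReadByScalar=*/4] ||
                     BBAccess[/*NonConstantStoreByScalar=*/2] ||
                     BBAccess[/*OtherUsage=*/5];
  assert(!ShouldSplit &&
         "whole store + constant scalar store does not force a split");
  return 0;
}
```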
 /// A custom IRBuilder inserter which prefixes all names, but only in
 /// Assert builds.
@@ -260,6 +361,20 @@
     std::inplace_merge(Slices.begin(), SliceI, Slices.end());
   }
+  void insertAllocAccessMap(
+      const SmallDenseMap<BasicBlock *, APInt> &allocAccessMap) {
+    for (const auto &kv : allocAccessMap) {
+      BasicBlock *BBPtr = kv.first;
+      APInt incrementalAccess = kv.second;
+      auto iter = allocAccessBitMapPerBB.find(BBPtr);
+      if (iter != allocAccessBitMapPerBB.end()) {
+        iter->second |= incrementalAccess;
+      } else {
+        allocAccessBitMapPerBB.insert(std::make_pair(BBPtr, incrementalAccess));
+      }
+    }
+  }
+
   // Forward declare the iterator and range accessor for walking the
   // partitions.
   class partition_iterator;
@@ -298,6 +413,9 @@
   friend class AllocaSlices::SliceBuilder;
+
+  // The size of this alloca instruction.
+  const uint64_t AllocaSize;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Handle to alloca instruction to simplify method interfaces.
   AllocaInst &AI;
@@ -311,14 +429,6 @@
   /// alloca. This will be null if the alloca slices are analyzed successfully.
   Instruction *PointerEscapingInstr;
-  /// The slices of the alloca.
-  ///
-  /// We store a vector of the slices formed by uses of the alloca here. This
-  /// vector is sorted by increasing begin offset, and then the unsplittable
-  /// slices before the splittable ones. See the Slice inner class for more
-  /// details.
-  SmallVector<Slice, 8> Slices;
-
   /// Instructions which will become dead if we rewrite the alloca.
   ///
   /// Note that these are not separated by slice. This is because we expect an
@@ -339,6 +449,25 @@
   /// want to swap this particular input for undef to simplify the use lists of
   /// the alloca.
   SmallVector<Use *, 8> DeadOperands;
+
+public:
+  /// The slices of the alloca.
+  ///
+  /// We store a vector of the slices formed by uses of the alloca here. This
+  /// vector is sorted by increasing begin offset, and then the unsplittable
+  /// slices before the splittable ones. See the Slice inner class for more
+  /// details.
+  SmallVector<Slice, 8> Slices;
+
+  /// NOTE: The following two fields are valid iff AllocaSlices::insert
+  /// hasn't been called.
+  ///
+  /// A map from basic block pointer to the access pattern bit map.
+  SmallDenseMap<BasicBlock *, APInt> allocAccessBitMapPerBB;
+
+  /// Records the slice intervals per basic block, for all constant stores.
+  SmallDenseMap<BasicBlock *, SmallVector<std::pair<uint64_t, uint64_t>>>
+      constantStoreIntervalPerBB;
 };
 /// A partition of the slices.
@@ -657,7 +786,15 @@
   SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
       : PtrUseVisitor<SliceBuilder>(DL),
         AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
-        AS(AS) {}
+        AS(AS) {
+    Function *funcPtr = AI.getParent()->getParent();
+    for (Function::iterator FI = funcPtr->begin(), FE = funcPtr->end();
+         FI != FE; ++FI) {
+      BasicBlock *BBPtr = &*FI;
+      AS.allocAccessBitMapPerBB.insert(
+          std::make_pair(BBPtr, APInt(accessTypeBitMapLength(), 0)));
+    }
+  }
 private:
   void markAsDead(Instruction &I) {
@@ -665,6 +802,7 @@
       AS.DeadUsers.push_back(&I);
   }
+  // Constructs a Slice and appends it to the internal state of AllocaSlices.
   void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
                  bool IsSplittable = false) {
     // Completely skip uses which have a zero size or start either before or
@@ -763,14 +901,58 @@
     return Base::visitGetElementPtrInst(GEPI);
   }
+  void InsertOrUpdateBasicBlockAccessBitMap(BasicBlock *BBPtr,
+                                            const unsigned bitPos) {
+    auto iter = AS.allocAccessBitMapPerBB.find(BBPtr);
+    if (iter != AS.allocAccessBitMapPerBB.end()) {
+      iter->second.setBit(bitPos);
+    } else {
+      AS.allocAccessBitMapPerBB.insert(std::make_pair(
+          BBPtr, APInt::getOneBitSet(accessTypeBitMapLength(), bitPos)));
+    }
+  }
+
   void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                          uint64_t Size, bool IsVolatile) {
+    assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+           "Expected load or store instruction");
     // We allow splitting of non-volatile loads and stores where the type is an
     // integer type. These may be used to implement 'memcpy' or other "transfer
     // of bits" patterns.
     bool IsSplittable =
         Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
+    if (isa<LoadInst>(I)) {
+      InsertOrUpdateBasicBlockAccessBitMap(
+          I.getParent(),
+          IsVolatile
+              ? accessTypeToBitMapIndex(AllocaAccessKind::OtherUsage)
+              : ((Size == AllocSize)
+                     ? accessTypeToBitMapIndex(AllocaAccessKind::ReadAsAWhole)
+                     : accessTypeToBitMapIndex(
+                           AllocaAccessKind::ReadByScalar)));
+    } else {
+      assert(isa<StoreInst>(I) && "Expected a store instruction");
+      StoreInst *storeInstPtr = dyn_cast<StoreInst>(&I);
+      bool storeConstant = isa<Constant>(storeInstPtr->getValueOperand());
+
+      const int bitPos =
+          IsVolatile
+              ? (accessTypeToBitMapIndex(AllocaAccessKind::OtherUsage))
+              : ((Size == AllocSize)
+                     ? accessTypeToBitMapIndex(AllocaAccessKind::StoreAsAWhole)
+                     : (storeConstant
+                            ? accessTypeToBitMapIndex(
+                                  AllocaAccessKind::ConstantStoreByScalar)
+                            : accessTypeToBitMapIndex(
+                                  AllocaAccessKind::NonConstantStoreByScalar)));
+      if (storeConstant) {
+        AS.constantStoreIntervalPerBB[I.getParent()].push_back(std::make_pair(
+            Offset.getLimitedValue(), Offset.getLimitedValue() + Size));
+      }
+      InsertOrUpdateBasicBlockAccessBitMap(I.getParent(), bitPos);
+    }
+
     insertUse(I, Offset, Size, IsSplittable);
   }
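Reviewer note: the nested conditional expressions above can be read as the flat decision tree below. This is an illustrative standalone sketch, not part of the patch; the enumerator names match the patch, while the free function and its boolean parameters are invented for the example.

```cpp
// Sketch only: the load/store classification performed in handleLoadOrStore,
// written out as a flat decision tree.
enum class AllocaAccessKind {
  StoreAsAWhole, ConstantStoreByScalar, NonConstantStoreByScalar,
  ReadAsAWhole, ReadByScalar, OtherUsage, AccessKindCount
};

AllocaAccessKind classifyAccess(bool IsLoad, bool IsVolatile,
                                bool CoversWholeAlloca, bool StoresConstant) {
  if (IsVolatile)
    return AllocaAccessKind::OtherUsage;    // volatile accesses stay opaque
  if (IsLoad)
    return CoversWholeAlloca ? AllocaAccessKind::ReadAsAWhole
                             : AllocaAccessKind::ReadByScalar;
  if (CoversWholeAlloca)
    return AllocaAccessKind::StoreAsAWhole; // whole store, constant or not
  return StoresConstant ? AllocaAccessKind::ConstantStoreByScalar
                        : AllocaAccessKind::NonConstantStoreByScalar;
}
```

Note that whenever the stored value is a constant, the patch also records the byte interval [Offset, Offset + Size) in `constantStoreIntervalPerBB`; the coverage analysis later in the patch consumes those intervals.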
@@ -845,8 +1027,14 @@
     if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
       return PI.setAborted(&II);
-    insertUse(II, Offset, Length ? Length->getLimitedValue()
-                                 : AllocSize - Offset.getLimitedValue(),
+    InsertOrUpdateBasicBlockAccessBitMap(
+        II.getParent(),
+
+        (accessTypeToBitMapIndex(AllocaAccessKind::OtherUsage)));
+
+    insertUse(II, Offset,
+              Length ? Length->getLimitedValue()
+                     : AllocSize - Offset.getLimitedValue(),
               (bool)Length);
   }
@@ -864,6 +1052,15 @@
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
+    const uint64_t RawOffset = Offset.getLimitedValue();
+    const uint64_t SliceSize =
+        Length ? Length->getLimitedValue() : AllocSize - RawOffset;
+
+    // Regard all `memcpy` and `memmove` as other usages.
+    // There is a separate pass to optimize `memcpy`.
+    InsertOrUpdateBasicBlockAccessBitMap(
+        II.getParent(), accessTypeToBitMapIndex(AllocaAccessKind::OtherUsage));
+
     // Don't replace this with a load/store with a different address space.
     // TODO: Use a store with the casted new alloca?
     if (II.isVolatile() &&
@@ -884,9 +1081,6 @@
       return markAsDead(II);
     }
-    uint64_t RawOffset = Offset.getLimitedValue();
-    uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
-
     // Check for the special case where the same exact value is used for both
     // source and dest.
     if (*U == II.getRawDest() && *U == II.getRawSource()) {
@@ -894,7 +1088,7 @@
       if (!II.isVolatile())
         return markAsDead(II);
-      return insertUse(II, Offset, Size, /*IsSplittable=*/false);
+      return insertUse(II, Offset, SliceSize, /*IsSplittable=*/false);
     }
     // If we have seen both source and destination for a mem transfer, then
@@ -920,7 +1114,7 @@
     }
     // Insert the use now that we've fixed up the splittable nature.
-    insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
+    insertUse(II, Offset, SliceSize, /*IsSplittable=*/Inserted && Length);
     // Check that we ended up with a valid index in the map.
     assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
@@ -929,6 +1123,7 @@
   // Disable SRoA for any intrinsics except for lifetime invariants and
   // invariant group.
+  //
   // FIXME: What about debug intrinsics? This matches old behavior, but
   // doesn't make sense.
   void visitIntrinsicInst(IntrinsicInst &II) {
@@ -940,15 +1135,24 @@
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
+    // For lifetime invariants, record the access kind as `OtherUsage`.
     if (II.isLifetimeStartOrEnd()) {
       ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
       uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
                                Length->getLimitedValue());
+      InsertOrUpdateBasicBlockAccessBitMap(
+          II.getParent(),
+          accessTypeToBitMapIndex(AllocaAccessKind::OtherUsage));
+
       insertUse(II, Offset, Size, true);
       return;
     }
+    // For invariant group, record the access kind as `OtherUsage`.
     if (II.isLaunderOrStripInvariantGroup()) {
+      InsertOrUpdateBasicBlockAccessBitMap(
+          II.getParent(),
+          accessTypeToBitMapIndex(AllocaAccessKind::OtherUsage));
       enqueueUsers(II);
       return;
     }
@@ -1051,6 +1255,12 @@
       return;
     }
+    InsertOrUpdateBasicBlockAccessBitMap(
+        I.getParent(),
+        (Offset.getLimitedValue() == 0 && Size == AllocSize)
+            ? accessTypeToBitMapIndex(AllocaAccessKind::ReadAsAWhole)
+            : accessTypeToBitMapIndex(AllocaAccessKind::ReadByScalar));
+
     insertUse(I, Offset, Size);
   }
@@ -1063,7 +1273,7 @@
 };
 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
-    :
+    : AllocaSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
       AI(AI),
 #endif
@@ -1084,6 +1294,22 @@
   // Sort the uses. This arranges for the offsets to be in ascending order,
   // and the sizes to be in descending order.
   llvm::stable_sort(Slices);
+
+  // Sort the intervals of each basic block.
+  for (auto IB = constantStoreIntervalPerBB.begin(),
+            IE = constantStoreIntervalPerBB.end();
+       IB != IE; IB++) {
+    llvm::stable_sort(IB->second, [](const std::pair<uint64_t, uint64_t> &a,
+                                     const std::pair<uint64_t, uint64_t> &b) {
+      if (a.first < b.first)
+        return true;
+      if (a.first > b.first)
+        return false;
+      if (a.second > b.second)
+        return true;
+      return false;
+    });
+  }
 }
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3228,9 +3454,11 @@
   /// Rewrite loads and stores through a pointer and all pointers derived from
   /// it.
-  bool rewrite(Instruction &I) {
+  bool rewrite(AllocaInst &AI) {
+    LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
-    enqueueUsers(I);
+
+    enqueueUsers(AI);
     bool Changed = false;
     while (!Queue.empty()) {
       U = Queue.pop_back_val();
@@ -3942,6 +4170,9 @@
   // on them.
   SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+  // Tracks the alloca access bit map for newly added instructions.
+  SmallDenseMap<BasicBlock *, APInt> incrementalAllocAccessBitMapPerBasicBlock;
+
   // At this point, we have collected all of the loads and stores we can
   // pre-split, and the specific splits needed for them. We actually do the
  // splitting in a specific order in order to handle when one of the loads in
@@ -3993,6 +4224,21 @@
       // to rewrite the stores.
       SplitLoads.push_back(PLoad);
+      const unsigned bitPos =
+          PartSize == LoadSize
+              ? accessTypeToBitMapIndex(AllocaAccessKind::ReadAsAWhole)
+              : accessTypeToBitMapIndex(AllocaAccessKind::ReadByScalar);
+
+      auto iter =
+          incrementalAllocAccessBitMapPerBasicBlock.find(LI->getParent());
+      if (iter != incrementalAllocAccessBitMapPerBasicBlock.end()) {
+        iter->second.setBit(bitPos);
+      } else {
+        incrementalAllocAccessBitMapPerBasicBlock.insert(std::make_pair(
+            LI->getParent(),
+            APInt::getOneBitSet(accessTypeBitMapLength(), bitPos)));
+      }
+
       // Now build a new slice for the alloca.
       NewSlices.push_back(
           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
@@ -4147,6 +4393,26 @@
       PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
                                  LLVMContext::MD_access_group});
+      const bool storeValIsConstant = isa<Constant>(PStore->getValueOperand());
+      const int bitPos =
+          (PartSize == StoreSize)
+              ? accessTypeToBitMapIndex(AllocaAccessKind::StoreAsAWhole)
+              : (storeValIsConstant
+                     ? accessTypeToBitMapIndex(
+                           AllocaAccessKind::ConstantStoreByScalar)
+                     : accessTypeToBitMapIndex(
+                           AllocaAccessKind::NonConstantStoreByScalar));
+
+      auto iter =
+          incrementalAllocAccessBitMapPerBasicBlock.find(SI->getParent());
+      if (iter != incrementalAllocAccessBitMapPerBasicBlock.end()) {
+        iter->second.setBit(bitPos);
+      } else {
+        incrementalAllocAccessBitMapPerBasicBlock.insert(std::make_pair(
+            SI->getParent(),
+            APInt::getOneBitSet(accessTypeBitMapLength(), bitPos)));
+      }
+
       // Now build a new slice for the alloca.
       NewSlices.push_back(
           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
@@ -4209,6 +4475,8 @@
   // Insert our new slices. This will sort and merge them into the sorted
   // sequence.
   AS.insert(NewSlices);
+  // Insert the incremental alloca access bit map.
+  AS.insertAllocAccessMap(incrementalAllocAccessBitMapPerBasicBlock);
   LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
 #ifndef NDEBUG
@@ -4385,6 +4653,55 @@
   return NewAI;
 }
+// For each basic block in `BBPtrToFurtherAnalyze`, analyze the intervals of
+// store instructions that store a constant value, and return true iff the
+// constant stores cover the whole allocated address range.
+//
+// FIXME: Cover the scenario when constant stores effectively cover all
+// slices but `AllocaSize` is larger due to padding (e.g. see
+// `test_struct_of_int_char` in test/Transforms/SROA/alloca-struct.ll).
+//
+// The caller guarantees that there is no scalar usage in the basic blocks to
+// analyze.
+bool SROAPass::scalarConstantStoresEquivalentToWholeUsage(
+    SmallVector<BasicBlock *> &BBPtrToFurtherAnalyze, AllocaSlices &AS,
+    uint64_t AllocaSize) {
+  bool wholeUsage = true;
+  while (!BBPtrToFurtherAnalyze.empty()) {
+    BasicBlock *BBPtr = BBPtrToFurtherAnalyze.pop_back_val();
+    const auto &constantStoreIntervals = AS.constantStoreIntervalPerBB[BBPtr];
+
+    assert((!constantStoreIntervals.empty()) &&
+           "A basic block to analyze "
+           "should at least have scalar constant stores");
+
+    auto iter = constantStoreIntervals.begin();
+    uint64_t MinOffset = iter->first, MaxOffset = iter->second;
+    assert((MinOffset <= MaxOffset) && "MinOffset should be smaller than or "
+                                       "equal to MaxOffset");
+    for (++iter; iter != constantStoreIntervals.end(); ++iter) {
+      assert((iter->first <= iter->second) &&
+             "Interval begin should be smaller than or equal to interval end");
+      if (iter->first > MaxOffset) {
+        // There are gaps in the intervals.
+        wholeUsage = false;
+        break;
+      }
+
+      MinOffset = std::min(MinOffset, iter->first);
+      MaxOffset = std::max(MaxOffset, iter->second);
+    } // end for
+    if (MinOffset != 0 || MaxOffset != AllocaSize) {
+      wholeUsage = false;
+    }
+
+    if (!wholeUsage) {
+      break;
+    }
+  } // end while
+  return wholeUsage;
+}
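Reviewer note: to make the interval reasoning concrete, here is a standalone sketch (not part of the patch) of the same coverage check over intervals already sorted by begin offset, with two tiny worked examples. Only the C++ standard library is assumed; all names are illustrative.

```cpp
// Sketch only: do sorted, half-open constant-store intervals cover [0, AllocaSize)?
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

static bool coversWholeAlloca(
    const std::vector<std::pair<uint64_t, uint64_t>> &SortedIntervals,
    uint64_t AllocaSize) {
  if (SortedIntervals.empty())
    return false;
  // Since the intervals are sorted by begin, the first begin is the minimum.
  uint64_t MaxOffset = SortedIntervals.front().second;
  for (const auto &I : SortedIntervals) {
    if (I.first > MaxOffset)
      return false; // gap between the covered prefix and this interval
    MaxOffset = std::max(MaxOffset, I.second);
  }
  return SortedIntervals.front().first == 0 && MaxOffset == AllocaSize;
}

int main() {
  // struct { i32; i32 } with both fields stored as constants: covered.
  assert(coversWholeAlloca({{0, 4}, {4, 8}}, 8));
  // Only the second field stored: a gap at [0, 4), so not covered.
  assert(!coversWholeAlloca({{4, 8}}, 8));
  return 0;
}
```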
 /// Walks the slices of an alloca and form partitions based on them,
 /// rewriting each of their uses.
 bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
@@ -4396,6 +4713,12 @@
   const DataLayout &DL = AI.getModule()->getDataLayout();
   // First try to pre-split loads and stores.
+  // Note, `presplitLoadsAndStores` may insert new instructions and
+  // corresponding Slices, as well as update `allocAccessBitMapPerBB`
+  // accordingly.
+  // When `presplitLoadsAndStores` makes any change (i.e., returns true), the
+  // access pattern analysis below won't happen.
+  // FIXME: Make the access pattern analysis work when `presplitLoadsAndStores`
+  // returns true.
   Changed |= presplitLoadsAndStores(AI, AS);
   // Now that we have identified any pre-splitting opportunities,
@@ -4410,30 +4733,62 @@
       DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
   const uint64_t MaxBitVectorSize = 1024;
   if (AllocaSize <= MaxBitVectorSize) {
-    // If a byte boundary is included in any load or store, a slice starting or
-    // ending at the boundary is not splittable.
-    SmallBitVector SplittableOffset(AllocaSize + 1, true);
-    for (Slice &S : AS)
-      for (unsigned O = S.beginOffset() + 1;
-           O < S.endOffset() && O < AllocaSize; O++)
-        SplittableOffset.reset(O);
+    bool allocaHasScalarUsage = false;
+    SmallVector<BasicBlock *> BBPtrToFurtherAnalyze;
+    for (const auto &kv : AS.allocAccessBitMapPerBB) {
+      BasicBlock *BBPtr = kv.first;
+      APInt access = kv.second;
+      if (shouldSplitAllocaPerAccessBitMap(access)) {
+        allocaHasScalarUsage = true;
+        break;
+      }
+      if (access[accessTypeToBitMapIndex(
+              AllocaAccessKind::ConstantStoreByScalar)]) {
+        BBPtrToFurtherAnalyze.push_back(BBPtr);
+      }
+    }
-    for (Slice &S : AS) {
-      if (!S.isSplittable())
-        continue;
+    if ((!Changed) && (!allocaHasScalarUsage) &&
+        scalarConstantStoresEquivalentToWholeUsage(BBPtrToFurtherAnalyze, AS,
+                                                   AllocaSize)) {
+      // Scalar loads have already been checked and excluded, so any load must
+      // be a whole-alloca load with a corresponding whole slice.
+      //
+      // Make each slice unsplittable so they form one partition; this way
+      // the alloca instruction won't be split.
+      //
+      // Rather than returning early, the function proceeds so that
+      // `SROAPass::rewritePartition` will be invoked and the logic to evaluate
+      // mem2reg readiness is reused.
+      for (Slice &S : AS) {
+        S.makeUnsplittable();
+      }
+    } else {
+      // If a byte boundary is included in any load or store, a slice starting
+      // or ending at the boundary is not splittable.
+      SmallBitVector SplittableOffset(AllocaSize + 1, true);
+      for (Slice &S : AS)
+        for (unsigned O = S.beginOffset() + 1;
+             O < S.endOffset() && O < AllocaSize; O++)
+          SplittableOffset.reset(O);
+
+      for (Slice &S : AS) {
+        if (!S.isSplittable())
+          continue;
-      if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
-          (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
-        continue;
+        if ((S.beginOffset() > AllocaSize ||
+             SplittableOffset[S.beginOffset()]) &&
+            (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
+          continue;
-      if (isa<LoadInst>(S.getUse()->getUser()) ||
-          isa<StoreInst>(S.getUse()->getUser())) {
-        S.makeUnsplittable();
-        IsSorted = false;
+        if (isa<LoadInst>(S.getUse()->getUser()) ||
+            isa<StoreInst>(S.getUse()->getUser())) {
+          S.makeUnsplittable();
+          IsSorted = false;
+        }
       }
     }
-  }
-  else {
+  } else {
     // We only allow whole-alloca splittable loads and stores
     // for a large alloca to avoid creating too large BitVector.
     for (Slice &S : AS) {
@@ -4656,7 +5011,8 @@
   bool Changed = false;
   while (!DeadInsts.empty()) {
     Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
-    if (!I) continue;
+    if (!I)
+      continue;
     LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
     // If the instruction is an alloca, find the possible dbg.declare connected
diff --git a/llvm/test/Transforms/SROA/alloca-struct.ll b/llvm/test/Transforms/SROA/alloca-struct.ll
--- a/llvm/test/Transforms/SROA/alloca-struct.ll
+++ b/llvm/test/Transforms/SROA/alloca-struct.ll
@@ -72,24 +72,15 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 [[TEST:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK: if.then:
+; CHECK-NEXT: [[RETVAL_SROA_0_0_INSERT_MASK:%.*]] = and i64 undef, -4294967296
+; CHECK-NEXT: [[RETVAL_SROA_0_4_INSERT_MASK:%.*]] = and i64 [[RETVAL_SROA_0_0_INSERT_MASK]], 4294967295
 ; CHECK-NEXT: br label [[RETURN:%.*]]
 ; CHECK: if.end:
 ; CHECK-NEXT: [[CALL:%.*]] = call i64 [[P:%.*]]()
-; CHECK-NEXT: [[RETVAL_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[CALL]] to i32
-; CHECK-NEXT: [[RETVAL_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[CALL]], 32
-; CHECK-NEXT: [[RETVAL_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[RETVAL_SROA_3_0_EXTRACT_SHIFT]] to i32
 ; CHECK-NEXT: br label [[RETURN]]
 ; CHECK: return:
-; CHECK-NEXT: [[RETVAL_SROA_3_0:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[RETVAL_SROA_3_0_EXTRACT_TRUNC]], [[IF_END]] ]
-; CHECK-NEXT: [[RETVAL_SROA_0_0:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[RETVAL_SROA_0_0_EXTRACT_TRUNC]], [[IF_END]] ]
-; CHECK-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i32 [[RETVAL_SROA_3_0]] to i64
-; CHECK-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 32
-; CHECK-NEXT: [[RETVAL_SROA_3_0_INSERT_MASK:%.*]] = and i64 undef, 4294967295
-; CHECK-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_MASK]], [[RETVAL_SROA_3_0_INSERT_SHIFT]]
-; CHECK-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[RETVAL_SROA_0_0]] to i64
-; CHECK-NEXT: [[RETVAL_SROA_0_0_INSERT_MASK:%.*]] = and i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], -4294967296
-; CHECK-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_0_0_INSERT_MASK]], [[RETVAL_SROA_0_0_INSERT_EXT]]
-; CHECK-NEXT: ret i64 [[RETVAL_SROA_0_0_INSERT_INSERT]]
+; CHECK-NEXT: [[RETVAL_SROA_0_0:%.*]] = phi i64 [ [[RETVAL_SROA_0_4_INSERT_MASK]], [[IF_THEN]] ], [ [[CALL]], [[IF_END]] ]
+; CHECK-NEXT: ret i64 [[RETVAL_SROA_0_0]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SROA/basictest-opaque-ptrs.ll b/llvm/test/Transforms/SROA/basictest-opaque-ptrs.ll
---
a/llvm/test/Transforms/SROA/basictest-opaque-ptrs.ll +++ b/llvm/test/Transforms/SROA/basictest-opaque-ptrs.ll @@ -118,9 +118,9 @@ ; CHECK-LABEL: @test2_addrspacecast_gep_offset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [10 x i8], align 1 -; CHECK-NEXT: [[A_SROA_0_2_GEPB_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i16 2 -; CHECK-NEXT: [[A_SROA_0_2_GEPB_SROA_CAST:%.*]] = addrspacecast ptr [[A_SROA_0_2_GEPB_SROA_IDX]] to ptr addrspace(1) -; CHECK-NEXT: store i64 [[X:%.*]], ptr addrspace(1) [[A_SROA_0_2_GEPB_SROA_CAST]], align 1 +; CHECK-NEXT: [[A_SROA_0_2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i16 2 +; CHECK-NEXT: [[A_SROA_0_2_SROA_CAST:%.*]] = addrspacecast ptr [[A_SROA_0_2_SROA_IDX]] to ptr addrspace(1) +; CHECK-NEXT: store i64 [[X:%.*]], ptr addrspace(1) [[A_SROA_0_2_SROA_CAST]], align 1 ; CHECK-NEXT: br label [[L2:%.*]] ; CHECK: L2: ; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_30_Z:%.*]] = load i64, ptr [[A_SROA_0]], align 1 @@ -149,7 +149,7 @@ ; CHECK-NEXT: [[A_SROA_32:%.*]] = alloca [16 x i8], align 1 ; CHECK-NEXT: [[A_SROA_15:%.*]] = alloca [42 x i8], align 1 ; CHECK-NEXT: [[A_SROA_16:%.*]] = alloca [7 x i8], align 1 -; CHECK-NEXT: [[A_SROA_234:%.*]] = alloca [7 x i8], align 1 +; CHECK-NEXT: [[A_SROA_235:%.*]] = alloca [7 x i8], align 1 ; CHECK-NEXT: [[A_SROA_31:%.*]] = alloca [85 x i8], align 1 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_0]], ptr align 8 [[SRC:%.*]], i32 42, i1 false), !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[A_SROA_2_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 42 @@ -164,61 +164,61 @@ ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_16]], ptr align 8 [[A_SROA_16_0_SRC_SROA_IDX]], i32 7, i1 false), !tbaa [[TBAA0]] ; CHECK-NEXT: [[A_SROA_23_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 207 ; CHECK-NEXT: [[A_SROA_23_0_COPYLOAD:%.*]] = load i8, ptr [[A_SROA_23_0_SRC_SROA_IDX]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[A_SROA_234_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 208 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_234]], ptr align 8 [[A_SROA_234_0_SRC_SROA_IDX]], i32 7, i1 false), !tbaa [[TBAA0]] +; CHECK-NEXT: [[A_SROA_235_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 208 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_235]], ptr align 8 [[A_SROA_235_0_SRC_SROA_IDX]], i32 7, i1 false), !tbaa [[TBAA0]] ; CHECK-NEXT: [[A_SROA_31_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 215 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_31]], ptr align 1 [[A_SROA_31_0_SRC_SROA_IDX]], i32 85, i1 false), !tbaa [[TBAA0]] ; CHECK-NEXT: store i8 1, ptr [[A_SROA_32]], align 1, !tbaa [[TBAA3:![0-9]+]] ; CHECK-NEXT: store i16 1, ptr [[A_SROA_32]], align 1, !tbaa [[TBAA5:![0-9]+]] ; CHECK-NEXT: store i32 1, ptr [[A_SROA_32]], align 1, !tbaa [[TBAA7:![0-9]+]] ; CHECK-NEXT: store i64 1, ptr [[A_SROA_32]], align 1, !tbaa [[TBAA9:![0-9]+]] -; CHECK-NEXT: [[A_SROA_32_1_OVERLAP_2_I64_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 1 -; CHECK-NEXT: store i64 2, ptr [[A_SROA_32_1_OVERLAP_2_I64_SROA_IDX]], align 1, !tbaa [[TBAA11:![0-9]+]] -; CHECK-NEXT: [[A_SROA_32_2_OVERLAP_3_I64_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 2 -; CHECK-NEXT: store i64 3, ptr [[A_SROA_32_2_OVERLAP_3_I64_SROA_IDX]], align 1, !tbaa [[TBAA13:![0-9]+]] -; CHECK-NEXT: [[A_SROA_32_3_OVERLAP_4_I64_SROA_IDX:%.*]] = 
getelementptr inbounds i8, ptr [[A_SROA_32]], i64 3 -; CHECK-NEXT: store i64 4, ptr [[A_SROA_32_3_OVERLAP_4_I64_SROA_IDX]], align 1, !tbaa [[TBAA15:![0-9]+]] -; CHECK-NEXT: [[A_SROA_32_4_OVERLAP_5_I64_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 4 -; CHECK-NEXT: store i64 5, ptr [[A_SROA_32_4_OVERLAP_5_I64_SROA_IDX]], align 1, !tbaa [[TBAA17:![0-9]+]] -; CHECK-NEXT: [[A_SROA_32_5_OVERLAP_6_I64_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 5 -; CHECK-NEXT: store i64 6, ptr [[A_SROA_32_5_OVERLAP_6_I64_SROA_IDX]], align 1, !tbaa [[TBAA19:![0-9]+]] -; CHECK-NEXT: [[A_SROA_32_6_OVERLAP_7_I64_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 6 -; CHECK-NEXT: store i64 7, ptr [[A_SROA_32_6_OVERLAP_7_I64_SROA_IDX]], align 1, !tbaa [[TBAA21:![0-9]+]] -; CHECK-NEXT: [[A_SROA_32_7_OVERLAP_8_I64_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 7 -; CHECK-NEXT: store i64 8, ptr [[A_SROA_32_7_OVERLAP_8_I64_SROA_IDX]], align 1, !tbaa [[TBAA23:![0-9]+]] -; CHECK-NEXT: [[A_SROA_32_8_OVERLAP_9_I64_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 8 -; CHECK-NEXT: store i64 9, ptr [[A_SROA_32_8_OVERLAP_9_I64_SROA_IDX]], align 1, !tbaa [[TBAA25:![0-9]+]] +; CHECK-NEXT: [[A_SROA_32_1_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 1 +; CHECK-NEXT: store i64 2, ptr [[A_SROA_32_1_SROA_IDX]], align 1, !tbaa [[TBAA11:![0-9]+]] +; CHECK-NEXT: [[A_SROA_32_2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 2 +; CHECK-NEXT: store i64 3, ptr [[A_SROA_32_2_SROA_IDX]], align 1, !tbaa [[TBAA13:![0-9]+]] +; CHECK-NEXT: [[A_SROA_32_3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 3 +; CHECK-NEXT: store i64 4, ptr [[A_SROA_32_3_SROA_IDX]], align 1, !tbaa [[TBAA15:![0-9]+]] +; CHECK-NEXT: [[A_SROA_32_4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 4 +; CHECK-NEXT: store i64 5, ptr [[A_SROA_32_4_SROA_IDX]], align 1, !tbaa [[TBAA17:![0-9]+]] +; CHECK-NEXT: [[A_SROA_32_5_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 5 +; CHECK-NEXT: store i64 6, ptr [[A_SROA_32_5_SROA_IDX]], align 1, !tbaa [[TBAA19:![0-9]+]] +; CHECK-NEXT: [[A_SROA_32_6_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 6 +; CHECK-NEXT: store i64 7, ptr [[A_SROA_32_6_SROA_IDX]], align 1, !tbaa [[TBAA21:![0-9]+]] +; CHECK-NEXT: [[A_SROA_32_7_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 7 +; CHECK-NEXT: store i64 8, ptr [[A_SROA_32_7_SROA_IDX]], align 1, !tbaa [[TBAA23:![0-9]+]] +; CHECK-NEXT: [[A_SROA_32_8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_32]], i64 8 +; CHECK-NEXT: store i64 9, ptr [[A_SROA_32_8_SROA_IDX]], align 1, !tbaa [[TBAA25:![0-9]+]] ; CHECK-NEXT: store i8 1, ptr [[A_SROA_16]], align 1, !tbaa [[TBAA27:![0-9]+]] ; CHECK-NEXT: store i16 1, ptr [[A_SROA_16]], align 1, !tbaa [[TBAA29:![0-9]+]] ; CHECK-NEXT: store i32 1, ptr [[A_SROA_16]], align 1, !tbaa [[TBAA31:![0-9]+]] -; CHECK-NEXT: [[A_SROA_16_1_OVERLAP2_1_1_I32_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_16]], i64 1 -; CHECK-NEXT: store i32 2, ptr [[A_SROA_16_1_OVERLAP2_1_1_I32_SROA_IDX]], align 1, !tbaa [[TBAA33:![0-9]+]] -; CHECK-NEXT: [[A_SROA_16_2_OVERLAP2_1_2_I32_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_16]], i64 2 -; CHECK-NEXT: store i32 3, ptr [[A_SROA_16_2_OVERLAP2_1_2_I32_SROA_IDX]], align 1, !tbaa [[TBAA35:![0-9]+]] -; CHECK-NEXT: [[A_SROA_16_3_OVERLAP2_1_3_I32_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr 
[[A_SROA_16]], i64 3 -; CHECK-NEXT: store i32 4, ptr [[A_SROA_16_3_OVERLAP2_1_3_I32_SROA_IDX]], align 1, !tbaa [[TBAA37:![0-9]+]] -; CHECK-NEXT: store i32 1, ptr [[A_SROA_234]], align 1, !tbaa [[TBAA39:![0-9]+]] -; CHECK-NEXT: [[A_SROA_234_1_OVERLAP2_2_1_I8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_234]], i64 1 -; CHECK-NEXT: store i8 1, ptr [[A_SROA_234_1_OVERLAP2_2_1_I8_SROA_IDX]], align 1, !tbaa [[TBAA41:![0-9]+]] -; CHECK-NEXT: [[A_SROA_234_1_OVERLAP2_2_1_I16_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_234]], i64 1 -; CHECK-NEXT: store i16 1, ptr [[A_SROA_234_1_OVERLAP2_2_1_I16_SROA_IDX]], align 1, !tbaa [[TBAA43:![0-9]+]] -; CHECK-NEXT: [[A_SROA_234_1_OVERLAP2_2_1_I32_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_234]], i64 1 -; CHECK-NEXT: store i32 1, ptr [[A_SROA_234_1_OVERLAP2_2_1_I32_SROA_IDX]], align 1, !tbaa [[TBAA45:![0-9]+]] -; CHECK-NEXT: [[A_SROA_234_2_OVERLAP2_2_2_I32_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_234]], i64 2 -; CHECK-NEXT: store i32 3, ptr [[A_SROA_234_2_OVERLAP2_2_2_I32_SROA_IDX]], align 1, !tbaa [[TBAA47:![0-9]+]] -; CHECK-NEXT: [[A_SROA_234_3_OVERLAP2_2_3_I32_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_234]], i64 3 -; CHECK-NEXT: store i32 4, ptr [[A_SROA_234_3_OVERLAP2_2_3_I32_SROA_IDX]], align 1, !tbaa [[TBAA49:![0-9]+]] -; CHECK-NEXT: [[A_SROA_15_197_OVERLAP2_PREFIX_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_15]], i64 39 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_15_197_OVERLAP2_PREFIX_SROA_IDX]], ptr align 1 [[SRC]], i32 3, i1 false), !tbaa [[TBAA51:![0-9]+]] +; CHECK-NEXT: [[A_SROA_16_1_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_16]], i64 1 +; CHECK-NEXT: store i32 2, ptr [[A_SROA_16_1_SROA_IDX]], align 1, !tbaa [[TBAA33:![0-9]+]] +; CHECK-NEXT: [[A_SROA_16_2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_16]], i64 2 +; CHECK-NEXT: store i32 3, ptr [[A_SROA_16_2_SROA_IDX]], align 1, !tbaa [[TBAA35:![0-9]+]] +; CHECK-NEXT: [[A_SROA_16_3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_16]], i64 3 +; CHECK-NEXT: store i32 4, ptr [[A_SROA_16_3_SROA_IDX]], align 1, !tbaa [[TBAA37:![0-9]+]] +; CHECK-NEXT: store i32 1, ptr [[A_SROA_235]], align 1, !tbaa [[TBAA39:![0-9]+]] +; CHECK-NEXT: [[A_SROA_235_1_SROA_IDX11:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_235]], i64 1 +; CHECK-NEXT: store i8 1, ptr [[A_SROA_235_1_SROA_IDX11]], align 1, !tbaa [[TBAA41:![0-9]+]] +; CHECK-NEXT: [[A_SROA_235_1_SROA_IDX10:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_235]], i64 1 +; CHECK-NEXT: store i16 1, ptr [[A_SROA_235_1_SROA_IDX10]], align 1, !tbaa [[TBAA43:![0-9]+]] +; CHECK-NEXT: [[A_SROA_235_1_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_235]], i64 1 +; CHECK-NEXT: store i32 1, ptr [[A_SROA_235_1_SROA_IDX]], align 1, !tbaa [[TBAA45:![0-9]+]] +; CHECK-NEXT: [[A_SROA_235_2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_235]], i64 2 +; CHECK-NEXT: store i32 3, ptr [[A_SROA_235_2_SROA_IDX]], align 1, !tbaa [[TBAA47:![0-9]+]] +; CHECK-NEXT: [[A_SROA_235_3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_235]], i64 3 +; CHECK-NEXT: store i32 4, ptr [[A_SROA_235_3_SROA_IDX]], align 1, !tbaa [[TBAA49:![0-9]+]] +; CHECK-NEXT: [[A_SROA_15_197_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_15]], i64 39 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_15_197_SROA_IDX]], ptr align 1 [[SRC]], i32 3, i1 false), !tbaa [[TBAA51:![0-9]+]] ; CHECK-NEXT: [[A_SROA_16_197_SRC_SROA_IDX:%.*]] 
= getelementptr inbounds i8, ptr [[SRC]], i64 3 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_16]], ptr align 1 [[A_SROA_16_197_SRC_SROA_IDX]], i32 5, i1 false), !tbaa [[TBAA51]] -; CHECK-NEXT: [[A_SROA_16_2_OVERLAP2_1_2_I8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_16]], i64 2 -; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_16_2_OVERLAP2_1_2_I8_SROA_IDX]], i8 42, i32 5, i1 false), !tbaa [[TBAA53:![0-9]+]] -; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_234]], i8 42, i32 2, i1 false), !tbaa [[TBAA53]] -; CHECK-NEXT: [[A_SROA_234_209_OVERLAP2_2_1_I8_SROA_IDX5:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_234]], i64 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_234_209_OVERLAP2_2_1_I8_SROA_IDX5]], ptr align 1 [[SRC]], i32 5, i1 false), !tbaa [[TBAA55:![0-9]+]] -; CHECK-NEXT: [[A_SROA_234_210_OVERLAP2_2_2_I8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_234]], i64 2 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_234_210_OVERLAP2_2_2_I8_SROA_IDX]], ptr align 1 [[SRC]], i32 5, i1 false), !tbaa [[TBAA57:![0-9]+]] +; CHECK-NEXT: [[A_SROA_16_2_SROA_IDX12:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_16]], i64 2 +; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_16_2_SROA_IDX12]], i8 42, i32 5, i1 false), !tbaa [[TBAA53:![0-9]+]] +; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_235]], i8 42, i32 2, i1 false), !tbaa [[TBAA53]] +; CHECK-NEXT: [[A_SROA_235_209_SROA_IDX8:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_235]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_235_209_SROA_IDX8]], ptr align 1 [[SRC]], i32 5, i1 false), !tbaa [[TBAA55:![0-9]+]] +; CHECK-NEXT: [[A_SROA_235_210_SROA_IDX9:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_235]], i64 2 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_235_210_SROA_IDX9]], ptr align 1 [[SRC]], i32 5, i1 false), !tbaa [[TBAA57:![0-9]+]] ; CHECK-NEXT: [[A_SROA_31_210_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 5 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_31]], ptr align 1 [[A_SROA_31_210_SRC_SROA_IDX]], i32 3, i1 false), !tbaa [[TBAA57]] ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST:%.*]], ptr align 1 [[A_SROA_0]], i32 42, i1 false), !tbaa [[TBAA59:![0-9]+]] @@ -234,8 +234,8 @@ ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_16_0_DST_SROA_IDX]], ptr align 1 [[A_SROA_16]], i32 7, i1 false), !tbaa [[TBAA59]] ; CHECK-NEXT: [[A_SROA_23_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 207 ; CHECK-NEXT: store i8 42, ptr [[A_SROA_23_0_DST_SROA_IDX]], align 1, !tbaa [[TBAA59]] -; CHECK-NEXT: [[A_SROA_234_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 208 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_234_0_DST_SROA_IDX]], ptr align 1 [[A_SROA_234]], i32 7, i1 false), !tbaa [[TBAA59]] +; CHECK-NEXT: [[A_SROA_235_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 208 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_235_0_DST_SROA_IDX]], ptr align 1 [[A_SROA_235]], i32 7, i1 false), !tbaa [[TBAA59]] ; CHECK-NEXT: [[A_SROA_31_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 215 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_31_0_DST_SROA_IDX]], ptr align 1 [[A_SROA_31]], i32 85, i1 false), !tbaa [[TBAA59]] ; CHECK-NEXT: ret void @@ -627,21 
+627,13 @@ ; ; CHECK-LABEL: @test12( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_SROA_3_0_INSERT_EXT:%.*]] = zext i8 0 to i24 -; CHECK-NEXT: [[A_SROA_3_0_INSERT_SHIFT:%.*]] = shl i24 [[A_SROA_3_0_INSERT_EXT]], 16 -; CHECK-NEXT: [[A_SROA_3_0_INSERT_MASK:%.*]] = and i24 undef, 65535 -; CHECK-NEXT: [[A_SROA_3_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_3_0_INSERT_MASK]], [[A_SROA_3_0_INSERT_SHIFT]] -; CHECK-NEXT: [[A_SROA_2_0_INSERT_EXT:%.*]] = zext i8 0 to i24 -; CHECK-NEXT: [[A_SROA_2_0_INSERT_SHIFT:%.*]] = shl i24 [[A_SROA_2_0_INSERT_EXT]], 8 -; CHECK-NEXT: [[A_SROA_2_0_INSERT_MASK:%.*]] = and i24 [[A_SROA_3_0_INSERT_INSERT]], -65281 -; CHECK-NEXT: [[A_SROA_2_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_2_0_INSERT_MASK]], [[A_SROA_2_0_INSERT_SHIFT]] -; CHECK-NEXT: [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i8 0 to i24 -; CHECK-NEXT: [[A_SROA_0_0_INSERT_MASK:%.*]] = and i24 [[A_SROA_2_0_INSERT_INSERT]], -256 -; CHECK-NEXT: [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_EXT]] -; CHECK-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[A_SROA_0_0_INSERT_INSERT]] to i8 -; CHECK-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_0_INSERT_INSERT]], 8 +; CHECK-NEXT: [[A_SROA_0_0_INSERT_MASK:%.*]] = and i24 undef, -256 +; CHECK-NEXT: [[A_SROA_0_1_INSERT_MASK:%.*]] = and i24 [[A_SROA_0_0_INSERT_MASK]], -65281 +; CHECK-NEXT: [[A_SROA_0_2_INSERT_MASK:%.*]] = and i24 [[A_SROA_0_1_INSERT_MASK]], 65535 +; CHECK-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[A_SROA_0_2_INSERT_MASK]] to i8 +; CHECK-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_2_INSERT_MASK]], 8 ; CHECK-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8 -; CHECK-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_0_INSERT_INSERT]], 16 +; CHECK-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_2_INSERT_MASK]], 16 ; CHECK-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[B_SROA_3_0_EXTRACT_SHIFT]] to i8 ; CHECK-NEXT: [[BSUM0:%.*]] = add i8 [[B_SROA_0_0_EXTRACT_TRUNC]], [[B_SROA_2_0_EXTRACT_TRUNC]] ; CHECK-NEXT: [[BSUM1:%.*]] = add i8 [[BSUM0]], [[B_SROA_3_0_EXTRACT_TRUNC]] @@ -1456,20 +1448,20 @@ ; CHECK: bb4: ; CHECK-NEXT: [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3 ; CHECK-NEXT: [[SRC_3:%.*]] = load i8, ptr [[SRC_GEP3]], align 1 -; CHECK-NEXT: [[TMP_SROA_0_3_TMP_GEP3_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[TMP_SROA_0]], i64 3 -; CHECK-NEXT: store i8 [[SRC_3]], ptr [[TMP_SROA_0_3_TMP_GEP3_SROA_IDX3]], align 1 +; CHECK-NEXT: [[TMP_SROA_0_3_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[TMP_SROA_0]], i64 3 +; CHECK-NEXT: store i8 [[SRC_3]], ptr [[TMP_SROA_0_3_SROA_IDX3]], align 1 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[SRC_GEP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 2 ; CHECK-NEXT: [[SRC_2:%.*]] = load i8, ptr [[SRC_GEP2]], align 1 -; CHECK-NEXT: [[TMP_SROA_0_2_TMP_GEP2_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP_SROA_0]], i64 2 -; CHECK-NEXT: store i8 [[SRC_2]], ptr [[TMP_SROA_0_2_TMP_GEP2_SROA_IDX2]], align 2 +; CHECK-NEXT: [[TMP_SROA_0_2_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP_SROA_0]], i64 2 +; CHECK-NEXT: store i8 [[SRC_2]], ptr [[TMP_SROA_0_2_SROA_IDX2]], align 2 ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: ; CHECK-NEXT: [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 1 ; CHECK-NEXT: [[SRC_1:%.*]] = load i8, ptr [[SRC_GEP1]], align 1 -; CHECK-NEXT: [[TMP_SROA_0_1_TMP_GEP1_SROA_IDX1:%.*]] = getelementptr 
inbounds i8, ptr [[TMP_SROA_0]], i64 1 -; CHECK-NEXT: store i8 [[SRC_1]], ptr [[TMP_SROA_0_1_TMP_GEP1_SROA_IDX1]], align 1 +; CHECK-NEXT: [[TMP_SROA_0_1_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP_SROA_0]], i64 1 +; CHECK-NEXT: store i8 [[SRC_1]], ptr [[TMP_SROA_0_1_SROA_IDX1]], align 1 ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb1: ; CHECK-NEXT: [[SRC_GEP0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 0 diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll --- a/llvm/test/Transforms/SROA/basictest.ll +++ b/llvm/test/Transforms/SROA/basictest.ll @@ -693,21 +693,13 @@ ; ; CHECK-LABEL: @test12( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_SROA_3_0_INSERT_EXT:%.*]] = zext i8 0 to i24 -; CHECK-NEXT: [[A_SROA_3_0_INSERT_SHIFT:%.*]] = shl i24 [[A_SROA_3_0_INSERT_EXT]], 16 -; CHECK-NEXT: [[A_SROA_3_0_INSERT_MASK:%.*]] = and i24 undef, 65535 -; CHECK-NEXT: [[A_SROA_3_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_3_0_INSERT_MASK]], [[A_SROA_3_0_INSERT_SHIFT]] -; CHECK-NEXT: [[A_SROA_2_0_INSERT_EXT:%.*]] = zext i8 0 to i24 -; CHECK-NEXT: [[A_SROA_2_0_INSERT_SHIFT:%.*]] = shl i24 [[A_SROA_2_0_INSERT_EXT]], 8 -; CHECK-NEXT: [[A_SROA_2_0_INSERT_MASK:%.*]] = and i24 [[A_SROA_3_0_INSERT_INSERT]], -65281 -; CHECK-NEXT: [[A_SROA_2_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_2_0_INSERT_MASK]], [[A_SROA_2_0_INSERT_SHIFT]] -; CHECK-NEXT: [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i8 0 to i24 -; CHECK-NEXT: [[A_SROA_0_0_INSERT_MASK:%.*]] = and i24 [[A_SROA_2_0_INSERT_INSERT]], -256 -; CHECK-NEXT: [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_EXT]] -; CHECK-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[A_SROA_0_0_INSERT_INSERT]] to i8 -; CHECK-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_0_INSERT_INSERT]], 8 +; CHECK-NEXT: [[A_SROA_0_0_INSERT_MASK:%.*]] = and i24 undef, -256 +; CHECK-NEXT: [[A_SROA_0_1_INSERT_MASK:%.*]] = and i24 [[A_SROA_0_0_INSERT_MASK]], -65281 +; CHECK-NEXT: [[A_SROA_0_2_INSERT_MASK:%.*]] = and i24 [[A_SROA_0_1_INSERT_MASK]], 65535 +; CHECK-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[A_SROA_0_2_INSERT_MASK]] to i8 +; CHECK-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_2_INSERT_MASK]], 8 ; CHECK-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8 -; CHECK-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_0_INSERT_INSERT]], 16 +; CHECK-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_2_INSERT_MASK]], 16 ; CHECK-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[B_SROA_3_0_EXTRACT_SHIFT]] to i8 ; CHECK-NEXT: [[BSUM0:%.*]] = add i8 [[B_SROA_0_0_EXTRACT_TRUNC]], [[B_SROA_2_0_EXTRACT_TRUNC]] ; CHECK-NEXT: [[BSUM1:%.*]] = add i8 [[BSUM0]], [[B_SROA_3_0_EXTRACT_TRUNC]] diff --git a/llvm/test/Transforms/SROA/big-endian.ll b/llvm/test/Transforms/SROA/big-endian.ll --- a/llvm/test/Transforms/SROA/big-endian.ll +++ b/llvm/test/Transforms/SROA/big-endian.ll @@ -25,17 +25,11 @@ %ai = load i24, i24* %aiptr ; CHECK-NOT: store ; CHECK-NOT: load -; CHECK: %[[ext2:.*]] = zext i8 0 to i24 -; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256 -; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]] -; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24 -; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8 -; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281 -; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]] -; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24 -; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16 -; 
CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535 -; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]] +; CHECK: %[[insert0:.*]] = and i24 undef, 65535 +; CHECK-NEXT: %[[insert1:.*]] = and i24 %[[insert0]], -65281 +; CHECK-NEXT: %[[insert2:.*]] = and i24 %[[insert1]], -256 + + %biptr = bitcast [3 x i8]* %b to i24* store i24 %ai, i24* %biptr @@ -47,11 +41,11 @@ %b2 = load i8, i8* %b2ptr ; CHECK-NOT: store ; CHECK-NOT: load -; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16 +; CHECK: %[[shift0:.*]] = lshr i24 %[[insert2]], 16 ; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8 -; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8 +; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert2]], 8 ; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8 -; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8 +; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert2]] to i8 %bsum0 = add i8 %b0, %b1 %bsum1 = add i8 %bsum0, %b2 @@ -66,6 +60,7 @@ ; promoted. ; ; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: entry: %a = alloca [7 x i8] @@ -82,50 +77,47 @@ %a0i16ptr = bitcast i8* %a0ptr to i16* store i16 1, i16* %a0i16ptr +; set a[0] to 1 +; Constant value is 0xFF FFFF FFFF in hexadecimal. +; CHECK-NEXT: [[mask0:%.*]] = and i56 undef, 1099511627775 +; Constant value is 0x100 0000 0000 in hexadecimal. +; CHECK-NEXT: [[insert0:%.*]] = or i56 [[mask0]], 1099511627776 + store i8 1, i8* %a2ptr +; set a[2] to 1 +; Constant value is 0xFF 0000 0001 in hexadecimal. +; CHECK-NEXT: [[mask1:%.*]] = and i56 [[insert0]], -1095216660481 +; Constant value is 0x1 0000 0000 in hexadecimal. +; CHECK-NEXT: [[insert2:%.*]] = or i56 [[mask1]], 4294967296 + %a3i24ptr = bitcast i8* %a3ptr to i24* store i24 1, i24* %a3i24ptr +; mask to get a[3] to a[5] +; Constant value is 0xFFFF FF01 in hexadecimal. +; CHECK-NEXT: [[mask3:%.*]] = and i56 [[insert2]], -4294967041 +; set a[3] to 1, a[4] to 0 and a[5] to 0 +; 256 is 0x100 in hexadecimal. +; CHECK-NEXT: [[insert3:%.*]] = or i56 [[mask3]], 256 + %a2i40ptr = bitcast i8* %a2ptr to i40* store i40 1, i40* %a2i40ptr +; Constant is 0x100 0000 0000 in hexadecimal. +; set a[6] to 1, and overwrite a[2] to a[5] as 0. 
+; CHECK-NEXT: [[mask2:%.*]] = and i56 [[insert3]], -1099511627776 +; CHECK-NEXT: [[insert4:%.*]] = or i56 [[mask2]], 1 -; the alloca is splitted into multiple slices -; Here, i8 1 is for %a[6] -; CHECK: %[[ext1:.*]] = zext i8 1 to i40 -; CHECK-NEXT: %[[mask1:.*]] = and i40 undef, -256 -; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], %[[ext1]] +; the alloca is rewritten into i56 -; Here, i24 0 is for %a[3] to %a[5] -; CHECK-NEXT: %[[ext2:.*]] = zext i24 0 to i40 -; CHECK-NEXT: %[[shift2:.*]] = shl i40 %[[ext2]], 8 -; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041 -; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], %[[shift2]] -; Here, i8 0 is for %a[2] -; CHECK-NEXT: %[[ext3:.*]] = zext i8 0 to i40 -; CHECK-NEXT: %[[shift3:.*]] = shl i40 %[[ext3]], 32 -; CHECK-NEXT: %[[mask3:.*]] = and i40 %[[insert2]], 4294967295 -; CHECK-NEXT: %[[insert3:.*]] = or i40 %[[mask3]], %[[shift3]] - -; CHECK-NEXT: %[[ext4:.*]] = zext i40 %[[insert3]] to i56 -; CHECK-NEXT: %[[mask4:.*]] = and i56 undef, -1099511627776 -; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[ext4]] - -; CHECK-NOT: store -; CHECK-NOT: load %aiptr = bitcast [7 x i8]* %a to i56* %ai = load i56, i56* %aiptr %ret = zext i56 %ai to i64 ret i64 %ret -; Here, i16 1 is for %a[0] to %a[1] -; CHECK-NEXT: %[[ext5:.*]] = zext i16 1 to i56 -; CHECK-NEXT: %[[shift5:.*]] = shl i56 %[[ext5]], 40 -; CHECK-NEXT: %[[mask5:.*]] = and i56 %[[insert4]], 1099511627775 -; CHECK-NEXT: %[[insert5:.*]] = or i56 %[[mask5]], %[[shift5]] -; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert5]] to i64 -; CHECK-NEXT: ret i64 %[[ret]] +; CHECK-NEXT: [[RET:%.*]] = zext i56 [[insert4]] to i64 +; CHECK-NEXT: ret i64 [[RET]] } define i64 @PR14132(i1 %flag) {
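Reviewer note: the comments added to big-endian.ll annotate the i56 masking constants with hexadecimal spellings. As a quick, standalone sanity check of that arithmetic (again not part of the patch), the correspondences can be confirmed with a few `static_assert`s:

```cpp
// Sketch only: verify the decimal constants in the CHECK lines against the
// hexadecimal values quoted in the new test comments.
#include <cstdint>

static_assert(1099511627775ULL == 0xFFFFFFFFFFULL,
              "mask keeping the low 40 bits of the i56 value");
static_assert(1099511627776ULL == 0x10000000000ULL,
              "i16 value 1 placed in the two most significant bytes (bit 40)");
static_assert(4294967296ULL == 0x100000000ULL,
              "i8 value 1 placed at byte offset 2 (bit 32)");
static_assert(1095216660481ULL == 0xFF00000001ULL,
              "magnitude of the -1095216660481 mask quoted in the comment");

int main() { return 0; }
```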