Index: lib/Transforms/InstCombine/InstCombine.h
===================================================================
--- lib/Transforms/InstCombine/InstCombine.h
+++ lib/Transforms/InstCombine/InstCombine.h
@@ -169,6 +169,7 @@
                                     Value *B, Value *C);
   Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A,
                                     Value *B, Value *C);
+  Instruction *FoldEndianIndependentLoad(BinaryOperator &I);
   Instruction *visitOr(BinaryOperator &I);
   Instruction *visitXor(BinaryOperator &I);
   Instruction *visitShl(BinaryOperator &I);
Index: lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Intrinsics.h"
@@ -2028,6 +2029,226 @@
   return nullptr;
 }
 
+/// \brief Match a (possibly shifted) load that is part of an
+/// endian-independent load sequence.
+/// When nothing is known yet, try to discover the root load pointer
+/// (\p LoadPtr) and the data endianness of the load (\p IsLittleEndianLoad).
+/// If the root load pointer is already known (by then, the endianness has to
+/// be known too), ensure that this (possibly shifted) load is consistent with
+/// them.
+static bool MatchShiftedLoad(Value *V, Value *&LoadPtr,
+                             bool &IsLittleEndianLoad, const DataLayout &DL,
+                             SmallVectorImpl<LoadInst *> &ByteLoads) {
+  Value *LoadV = nullptr;
+  ConstantInt *BitOffset = nullptr;
+  bool IsShiftedLoad = true;
+  const size_t ResByteSize = ByteLoads.size();
+
+  // Try to match a shifted load:
+  //   LE: (LoadPtr[I] << (I * 8))
+  //   BE: (LoadPtr[sizeof(EntireLoad) - I - 1] << (I * 8))
+  if (!match(V, m_Shl(m_ZExt(m_Value(LoadV)), m_ConstantInt(BitOffset)))) {
+    // If not, match a non-shifted load.
+    if (match(V, m_ZExt(m_Value(LoadV))))
+      IsShiftedLoad = false;
+    else
+      return false;
+  }
+
+  // If we already found a load pointer, we *have* to already know the
+  // endianness.  If not, we need to discover both the pointer and the
+  // endianness here.
+  bool IsEndiannessKnown = (LoadPtr != nullptr);
+  auto IsLittleEndian = [&](bool LittleEndian) {
+    if (!IsEndiannessKnown) {
+      IsEndiannessKnown = true;
+      IsLittleEndianLoad = LittleEndian;
+    }
+    return IsLittleEndianLoad == LittleEndian;
+  };
+
+  LoadInst *LoadI = dyn_cast<LoadInst>(LoadV);
+  if (!LoadI || !LoadI->isSimple() ||
+      LoadI->getParent() != cast<Instruction>(V)->getParent() ||
+      LoadI->getType() != Type::getInt8Ty(LoadI->getContext()))
+    return false;
+
+  GetElementPtrInst *GEPI =
+      dyn_cast<GetElementPtrInst>(LoadI->getPointerOperand());
+
+  // If we don't have a root load pointer yet, discover it here.
+  if (!LoadPtr)
+    LoadPtr = (GEPI ? GEPI->getPointerOperand() : LoadI->getPointerOperand());
+
+  // If we didn't find a GEP, this is a non-indexed load (LoadPtr[0]).
+  if (!GEPI) {
+    if (LoadPtr == LoadI->getPointerOperand()) {
+      // Match the non-shifted non-indexed LE load.
+      //   LE: (LoadPtr[0])
+      if (!IsShiftedLoad && IsLittleEndian(true))
+        return ByteLoads[0] = LoadI;
+
+      // Match the shifted non-indexed BE load.
+      //   BE: (LoadPtr[0] << ((sizeof(EntireLoad) - 1) * 8))
+      // This is equivalent to (LoadPtr[sizeof(EntireLoad) - I - 1] << (I * 8))
+      // when I == sizeof(EntireLoad) - 1.
+      if (IsShiftedLoad &&
+          BitOffset->getValue().getZExtValue() == (ResByteSize - 1) * 8 &&
+          IsLittleEndian(false)) {
+        return ByteLoads[0] = LoadI;
+      }
+    }
+    return false;
+  }
+
+  const unsigned AS = GEPI->getPointerAddressSpace();
+  const unsigned OffsetSizeInBits = DL.getPointerSizeInBits(AS);
+
+  APInt Offset(OffsetSizeInBits, 0);
+  if (LoadPtr != GEPI->getPointerOperand() ||
+      !GEPI->accumulateConstantOffset(DL, Offset))
+    return false;
+
+  const unsigned OffsetU = Offset.getZExtValue();
+  if (OffsetU >= ResByteSize)
+    return false;
+
+  // Match the non-shifted BE load.
+  //   BE: (LoadPtr[sizeof(EntireLoad) - I - 1])
+  if (!IsShiftedLoad) {
+    if (IsLittleEndian(false)) {
+      if (OffsetU != ResByteSize - 1)
+        return false;
+      return ByteLoads[ResByteSize - 1] = LoadI;
+    }
+    return false;
+  }
+
+  // If IsShiftedLoad, BitOffset has already been matched and is non-null.
+  const unsigned BitOffsetU = BitOffset->getValue().getZExtValue();
+
+  // Match the general case, a shifted indexed load.
+  //   LE: (LoadPtr[I] << (I * 8))
+  if (OffsetU * 8 == BitOffsetU && IsLittleEndian(true))
+    return ByteLoads[OffsetU] = LoadI;
+  //   BE: (LoadPtr[sizeof(EntireLoad) - I - 1] << (I * 8))
+  if ((ResByteSize - OffsetU - 1) * 8 == BitOffsetU && IsLittleEndian(false))
+    return ByteLoads[ResByteSize - OffsetU - 1] = LoadI;
+
+  return false;
+}
+
+/// \brief Replace a bytewise endian-independent load sequence (shl'd and or'd
+/// into a native-endian value) with a single native-endian load, bswap'd if
+/// the load sequence's endianness differs from the target's.
+///
+/// For instance, for a little-endian load on a little-endian target,
+/// given i32 EntireLoad and i8* LoadPtr, fold:
+///   EntireLoad =
+///     (LoadPtr[0] | LoadPtr[1] << 8 | LoadPtr[2] << 16 | LoadPtr[3] << 24)
+/// into:
+///   EntireLoad = *((i32*)LoadPtr)
+Instruction *InstCombiner::FoldEndianIndependentLoad(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+  Type *ResTy = I.getType();
+  const int ResBitSize = ResTy->getPrimitiveSizeInBits();
+  const int ResByteSize = ResBitSize / 8;
+
+  if (!DL || !ResTy->isIntegerTy() || (ResBitSize % 8) || ResBitSize < 16 ||
+      !isPowerOf2_32(ResByteSize))
+    return nullptr;
+
+  // Keep track of the instruction that loads each byte.
+  SmallVector<LoadInst *, 8> ByteLoads(ResByteSize);
+  // The pointer to the data loaded from.
+  Value *LoadPtr = nullptr;
+  // Whether the loaded data is little-endian.
+  bool IsLELoad = true;
+  // Whether the target data layout is little-endian.
+  const bool IsLETarget = DL->isLittleEndian();
+
+  // Try to match the OR operands as (possibly shifted) loads.
+  // Keep track of the OR operands in a worklist.
+  SmallVector<Value *, 8> ByteValueWorklist;
+  ByteValueWorklist.push_back(Op0);
+  ByteValueWorklist.push_back(Op1);
+  for (size_t i = 0; i < ByteValueWorklist.size(); ++i) {
+    Value *V = ByteValueWorklist[i];
+
+    // Try to match a (possibly shifted) load.
+    if (MatchShiftedLoad(V, LoadPtr, IsLELoad, *DL, ByteLoads))
+      continue;
+
+    // Try to match another OR.
+    Value *LHS = nullptr, *RHS = nullptr;
+    if (match(V, m_Or(m_Value(LHS), m_Value(RHS)))) {
+      ByteValueWorklist.push_back(LHS);
+      ByteValueWorklist.push_back(RHS);
+    } else {
+      return nullptr;
+    }
+  }
+
+  std::sort(ByteLoads.begin(), ByteLoads.end());
+
+  // Check that we load all bytes. If we don't, there's one or more nullptr(s)
+  // in ByteLoads, which will have been sorted to the beginning.
+  if (!ByteLoads[0])
+    return nullptr;
+
+  AliasAnalysis *AA = getAnalysisIfAvailable<AliasAnalysis>();
+
+  // Now go back up until we encounter all the loads, checking all stores in
+  // between.  If we made it this far, we have a clean OR-tree representing an
+  // endian-independent load sequence.  We can (somewhat) take our time, but
+  // add a max-scan safeguard just in case.
+  int NumLoadsEncountered = 0;
+  BasicBlock::iterator BBI = &I;
+  for (unsigned MaxScanInsts = ResByteSize * 5 + 50;
+       BBI != I.getParent()->begin() && MaxScanInsts; --MaxScanInsts) {
+    --BBI;
+
+    // If this is a store to a location that may alias the loaded bytes,
+    // bail out.
+    if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+      if (!SI->isSimple() || !AA ||
+          AA->alias(SI->getPointerOperand(), AliasAnalysis::UnknownSize,
+                    LoadPtr, ResByteSize))
+        return nullptr;
+    } else if (BBI->mayWriteToMemory()) {
+      // If this is any other kind of instruction that may write to memory,
+      // bail out.
+      return nullptr;
+    }
+
+    // If we found one of our loads, count it.
+    if (isa<LoadInst>(BBI))
+      if (std::binary_search(ByteLoads.begin(), ByteLoads.end(), &*BBI))
+        ++NumLoadsEncountered;
+
+    // If we found all our loads, no need to continue.
+    if (NumLoadsEncountered == ResByteSize)
+      break;
+  }
+
+  // If we didn't find all our loads, abort.
+  if (NumLoadsEncountered != ResByteSize)
+    return nullptr;
+
+  const unsigned Alignment = ByteLoads[0]->getAlignment();
+  const unsigned AS = cast<PointerType>(LoadPtr->getType())->getAddressSpace();
+  Instruction *FoldedLoad = Builder->CreateAlignedLoad(
+      Builder->CreatePointerCast(LoadPtr, ResTy->getPointerTo(AS)), Alignment);
+
+  // If the target endianness and the loaded data endianness differ, swap the
+  // bytes after doing the target-endian load.
+  if (IsLETarget != IsLELoad) {
+    Type *Tys[] = {FoldedLoad->getType()};
+    Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+    Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys);
+    FoldedLoad = Builder->CreateCall(TheFn, FoldedLoad);
+  }
+  return ReplaceInstUsesWith(I, FoldedLoad);
+}
+
 Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2386,6 +2607,10 @@
     }
   }
 
+  if (Instruction *FoldedLoad = FoldEndianIndependentLoad(I)) {
+    return FoldedLoad;
+  }
+
   return Changed ? &I : nullptr;
 }
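
For context, here is a minimal C sketch (not part of the patch; the function name read32_le is only illustrative) of the source-level idiom the new fold targets. At -O1 and above, clang lowers it to exactly the byte-load/shl/or chain that MatchShiftedLoad and FoldEndianIndependentLoad look for, which the tests below exercise directly in IR form:

    /* Illustrative only: a portable little-endian 32-bit read. With this
       patch, InstCombine should collapse the four byte loads into a single
       i32 load (see test_LE_loadLE32 below). */
    #include <stdint.h>

    uint32_t read32_le(const uint8_t *p) {
      return (uint32_t)p[0]
           | ((uint32_t)p[1] << 8)
           | ((uint32_t)p[2] << 16)
           | ((uint32_t)p[3] << 24);
    }
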
Index: test/Transforms/InstCombine/endian-independent-load-BE.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/endian-independent-load-BE.ll
@@ -0,0 +1,167 @@
+; RUN: opt %s -instcombine -S | FileCheck %s
+target datalayout = "E-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: @test_BE_loadBE32
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_BE_loadBE32(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 3
+  %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_BE_loadBE32_with_addrspace
+; CHECK: %1 = bitcast i8 addrspace(5)* %buffer to i32 addrspace(5)*
+; CHECK: %2 = load i32 addrspace(5)* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_BE_loadBE32_with_addrspace(i8 addrspace(5)* nocapture readonly %buffer) {
+  %1 = load i8 addrspace(5)* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 1
+  %5 = load i8 addrspace(5)* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 2
+  %10 = load i8 addrspace(5)* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 3
+  %15 = load i8 addrspace(5)* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_BE_loadLE32
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: %3 = call i32 @llvm.bswap.i32(i32 %2)
+; CHECK: ret i32 %3
+define i32 @test_BE_loadLE32(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  ret i32 %17
+}
+
+declare void @load_buffer(i8*, i32*)
+
+; CHECK-LABEL: @test_BE_loadBE32_align4
+; CHECK: %buffer_i = alloca i32, align 4
+; CHECK: %buffer = bitcast i32* %buffer_i to i8*
+; CHECK: call void @load_buffer(i8* %buffer, i32* %buffer_int)
+; CHECK: %1 = load i32* %buffer_i, align 4
+; CHECK: ret i32 %1
+define i32 @test_BE_loadBE32_align4(i32* %buffer_int) {
+  %buffer_i = alloca i32, align 4
+  %buffer = bitcast i32* %buffer_i to i8*
+  call void @load_buffer(i8* %buffer, i32* %buffer_int)
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 3
+  %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+  ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_BE_broken_sparse
+define i32 @test_BE_broken_sparse(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 4
+; CHECK: %14 = getelementptr inbounds i8* %buffer, i64 4
+  %15 = load i8* %14, align 1
+; CHECK: %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+; CHECK: %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+; CHECK: %17 = or i32 %13, %16
+  ret i32 %17
+; CHECK: ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_BE_broken_shift
+define i32 @test_BE_broken_shift(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 8
+; CHECK: %7 = shl nuw nsw i32 %6, 8
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 16
+; CHECK: %12 = shl nuw nsw i32 %11, 16
+  %13 = or i32 %8, %12
+; CHECK: %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 3
+  %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+; CHECK: %17 = or i32 %13, %16
+  ret i32 %17
+; CHECK: ret i32 %17
+}
Index: test/Transforms/InstCombine/endian-independent-load-LE-aliasing.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/endian-independent-load-LE-aliasing.ll
@@ -0,0 +1,155 @@
+; RUN: opt %s -basicaa -instcombine -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: @test_LE_load_noalias_allocastore
+; CHECK: %c0b = load i8* %buffer, align 1
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: %3 = add i32 %2, %c42
+; CHECK: ret i32 %3
+define i32 @test_LE_load_noalias_allocastore(i8* nocapture %buffer) {
+  %locbuf = alloca i32, align 4
+  %c0b = load i8* %buffer, align 1
+  %c0 = zext i8 %c0b to i32
+  store i32 %c0, i32* %locbuf
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %c0l = load i32* %locbuf
+  %c42 = add i32 %c0l, 42
+  store i32 %c42, i32* %locbuf
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  %c42l = load i32* %locbuf
+  %18 = add i32 %17, %c42l
+  ret i32 %18
+}
+
+; CHECK-LABEL: @test_LE_load_noalias_offsetstore
+; CHECK: %buf_off = getelementptr inbounds i8* %buffer, i64 4
+; CHECK: %c0l = load i8* %buf_off, align 1
+; CHECK: %c42 = add i8 %c0l, 42
+; CHECK: store i8 %c42, i8* %buf_off, align 1
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: %c42z = zext i8 %c42 to i32
+; CHECK: %3 = add i32 %2, %c42z
+; CHECK: ret i32 %3
+define i32 @test_LE_load_noalias_offsetstore(i8* nocapture %buffer) {
+  %locbuf = alloca i32, align 4
+  %c0b = load i8* %buffer, align 1
+  %c0 = zext i8 %c0b to i32
+  store i32 %c0, i32* %locbuf
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %buf_off = getelementptr inbounds i8* %buffer, i64 4
+  %c0l = load i8* %buf_off
+  %c42 = add i8 %c0l, 42
+  store i8 %c42, i8* %buf_off
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  %c42l = load i8* %buf_off
+  %c42z = zext i8 %c42l to i32
+  %18 = add i32 %17, %c42z
+  ret i32 %18
+}
+
+; CHECK-LABEL: @test_LE_load_alias_store
+define i32 @test_LE_load_alias_store(i8* nocapture %buffer) {
+  %1 = load i8* %buffer, align 1
+; CHECK: %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+; CHECK: %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+; CHECK: %9 = load i8* %8, align 1
+  %buf_off = getelementptr inbounds i8* %buffer, i64 2
+  %c0l = load i8* %buf_off
+  %c42 = add i8 %c0l, 42
+  store i8 %c42, i8* %buf_off
+; CHECK: store i8 %c42, i8* %buf_off
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+; CHECK: %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  %c42l = load i8* %buf_off
+; CHECK: %c42l = load i8* %buf_off
+  %c42z = zext i8 %c42l to i32
+  %18 = add i32 %17, %c42z
+  ret i32 %18
+; CHECK: ret i32 %18
+}
+
+; CHECK-LABEL: @test_LE_load_mayalias_store
+define i32 @test_LE_load_mayalias_store(i8* nocapture %buffer, i8* nocapture %othermem) {
+  %1 = load i8* %buffer, align 1
+; CHECK: %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+; CHECK: %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+; CHECK: %9 = load i8* %8, align 1
+  %buf_off = getelementptr inbounds i8* %othermem, i64 2
+  %c0l = load i8* %buf_off
+; CHECK: %c0l = load i8* %buf_off
+  %c42 = add i8 %c0l, 42
+  store i8 %c42, i8* %buf_off
+; CHECK: store i8 %c42, i8* %buf_off
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+; CHECK: %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+; CHECK: %17 = or i32 %12, %16
+  %c42l = load i8* %buf_off
+; CHECK: %c42l = load i8* %buf_off
+  %c42z = zext i8 %c42l to i32
+  %18 = add i32 %17, %c42z
+  ret i32 %18
+; CHECK: ret i32 %18
+}
Index: test/Transforms/InstCombine/endian-independent-load-LE.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/endian-independent-load-LE.ll
@@ -0,0 +1,262 @@
+; RUN: opt %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: @test_LE_loadLE32
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_LE_loadLE32(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_LE_loadLE64
+; CHECK: %1 = bitcast i8* %buffer to i64*
+; CHECK: %2 = load i64* %1, align 1
+; CHECK: ret i64 %2
+define i64 @test_LE_loadLE64(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i64
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i64
+  %6 = shl nuw nsw i64 %5, 8
+  %7 = or i64 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i64
+  %11 = shl nuw nsw i64 %10, 16
+  %12 = or i64 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i64
+  %16 = shl nuw nsw i64 %15, 24
+  %17 = or i64 %12, %16
+  %18 = getelementptr inbounds i8* %buffer, i64 4
+  %19 = load i8* %18, align 1
+  %20 = zext i8 %19 to i64
+  %21 = shl nuw nsw i64 %20, 32
+  %22 = or i64 %17, %21
+  %23 = getelementptr inbounds i8* %buffer, i64 5
+  %24 = load i8* %23, align 1
+  %25 = zext i8 %24 to i64
+  %26 = shl nuw nsw i64 %25, 40
+  %27 = or i64 %22, %26
+  %28 = getelementptr inbounds i8* %buffer, i64 6
+  %29 = load i8* %28, align 1
+  %30 = zext i8 %29 to i64
+  %31 = shl nuw nsw i64 %30, 48
+  %32 = or i64 %27, %31
+  %33 = getelementptr inbounds i8* %buffer, i64 7
+  %34 = load i8* %33, align 1
+  %35 = zext i8 %34 to i64
+  %36 = shl nuw i64 %35, 56
+  %37 = or i64 %32, %36
+  ret i64 %37
+}
+
+; CHECK-LABEL: @test_LE_loadLE32_permuted
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_LE_loadLE32_permuted(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = getelementptr inbounds i8* %buffer, i64 2
+  %8 = load i8* %7, align 1
+  %9 = zext i8 %8 to i32
+  %10 = shl nuw nsw i32 %9, 16
+  %11 = getelementptr inbounds i8* %buffer, i64 3
+  %12 = load i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw i32 %13, 24
+  %15 = or i32 %6, %2
+  %16 = or i32 %14, %10
+  %17 = or i32 %16, %15
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_LE_loadLE32_with_addrspace
+; CHECK: %1 = bitcast i8 addrspace(5)* %buffer to i32 addrspace(5)*
+; CHECK: %2 = load i32 addrspace(5)* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_LE_loadLE32_with_addrspace(i8 addrspace(5)* nocapture readonly %buffer) {
+  %1 = load i8 addrspace(5)* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 1
+  %4 = load i8 addrspace(5)* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 2
+  %9 = load i8 addrspace(5)* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 3
+  %14 = load i8 addrspace(5)* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_LE_loadBE32
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: %3 = call i32 @llvm.bswap.i32(i32 %2)
+; CHECK: ret i32 %3
+define i32 @test_LE_loadBE32(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 3
+  %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+  ret i32 %17
+}
+
+declare void @load_buffer(i8*, i32*)
+
+; CHECK-LABEL: @test_LE_loadLE32_align4
+; CHECK: %buffer_i = alloca i32, align 4
+; CHECK: %buffer = bitcast i32* %buffer_i to i8*
+; CHECK: call void @load_buffer(i8* %buffer, i32* %buffer_int)
+; CHECK: %1 = load i32* %buffer_i, align 4
+; CHECK: ret i32 %1
+define i32 @test_LE_loadLE32_align4(i32* %buffer_int) {
+  %buffer_i = alloca i32, align 4
+  %buffer = bitcast i32* %buffer_i to i8*
+  call void @load_buffer(i8* %buffer, i32* %buffer_int)
+  %1 = load i8* %buffer, align 4
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_LE_broken_sparse
+define i32 @test_LE_broken_sparse(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 4
+; CHECK: %13 = getelementptr inbounds i8* %buffer, i64 4
+  %14 = load i8* %13, align 1
+; CHECK: %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+; CHECK: %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+; CHECK: %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+; CHECK: %17 = or i32 %12, %16
+  ret i32 %17
+; CHECK: ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_LE_broken_shift
+define i32 @test_LE_broken_shift(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 24
+; CHECK: %11 = shl nuw nsw i32 %10, 24
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw nsw i32 %15, 16
+; CHECK: %16 = shl nuw nsw i32 %15, 16
+  %17 = or i32 %12, %16
+; CHECK: %17 = or i32 %12, %16
+  ret i32 %17
+; CHECK: ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_LE_broken_load
+define i32 @test_LE_broken_load(i8** %bits) {
+  %bits_ptr = load i8** %bits, align 8
+; CHECK: %bits_ptr = load i8** %bits, align 8
+  %1 = load i8* %bits_ptr, align 1
+  %conv = zext i8 %1 to i32
+  %shl = shl nuw nsw i32 %conv, 16
+  %arrayidx2 = getelementptr inbounds i8* %bits_ptr, i64 1
+  %2 = load i8* %arrayidx2, align 1
+  %conv3 = zext i8 %2 to i32
+  %shl4 = shl nuw nsw i32 %conv3, 8
+  %or = or i32 %shl, %shl4
+  %arrayidx6 = getelementptr inbounds i8* %bits_ptr, i64 2
+; CHECK: %arrayidx6 = getelementptr inbounds i8* %bits_ptr, i64 2
+  %3 = load i8* %arrayidx6, align 1
+; CHECK: %3 = load i8* %arrayidx6, align 1
+  %conv7 = zext i8 %3 to i32
+; CHECK: %conv7 = zext i8 %3 to i32
+  %or8 = or i32 %or, %conv7
+; CHECK: %or8 = or i32 %or, %conv7
+  ret i32 %or8
+; CHECK: ret i32 %or8
+}
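
To round out the picture, here is a matching C sketch of the big-endian read idiom (again illustrative only; read32_be is not a name used by the patch). On a little-endian target such as the x86_64 triple used in these tests, the expected result of the fold is a single i32 load followed by a call to llvm.bswap.i32, which is what test_LE_loadBE32 above checks for:

    /* Illustrative only: a portable big-endian 32-bit read. On a
       little-endian target this should become one i32 load plus a bswap. */
    #include <stdint.h>

    uint32_t read32_be(const uint8_t *p) {
      return ((uint32_t)p[0] << 24)
           | ((uint32_t)p[1] << 16)
           | ((uint32_t)p[2] << 8)
           |  (uint32_t)p[3];
    }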