Index: include/llvm/Analysis/Loads.h
===================================================================
--- include/llvm/Analysis/Loads.h
+++ include/llvm/Analysis/Loads.h
@@ -58,6 +58,44 @@
 /// to scan in the block, used by FindAvailableLoadedValue().
 extern cl::opt<unsigned> DefMaxInstsToScan;
 
+/// \brief Scan backwards to see if we have the value of type \p AccessTy
+/// at the memory address \p Ptr locally available within a
+/// small number of instructions. If the value is available, return it.
+///
+/// You can use this function to scan across multiple blocks: after you call
+/// this function, if ScanFrom points at the beginning of the block, it's safe
+/// to continue scanning the predecessors.
+/// Note that we assume \p *Ptr is accessed through a non-volatile but
+/// potentially atomic load. Any other constraints should be verified by the
+/// caller.
+///
+/// \param Ptr The memory location whose contents we are retrieving.
+/// \param AccessTy The type (and size) of the contents we need from \p Ptr.
+/// \param IsAtomicMemOp Specifies the atomicity of the memory operation that
+/// accesses \p *Ptr. We verify that atomicity constraints are satisfied when
+/// value forwarding from another memory operation that has \p *Ptr available.
+/// \param ScanBB The basic block to scan. FIXME: This is redundant.
+/// \param [in,out] ScanFrom The location to start scanning from. When this
+/// function returns, it points at the last instruction scanned.
+/// \param MaxInstsToScan The maximum number of instructions to scan. If this
+/// is zero, the whole block will be scanned.
+/// \param AA Optional pointer to alias analysis, to make the scan more
+/// precise.
+/// \param [out] AATags The aliasing metadata for the operation which produced
+/// the value. FIXME: This is basically useless.
+///
+/// \param [out] IsLoadCSE Whether the returned value is a load from the same
+/// location in memory, as opposed to the value operand of a store.
+///
+/// \returns The found value, or nullptr if no value is found.
+Value *FindAvailableLoadedValue(Value *Ptr, Type *AccessTy, bool IsAtomicMemOp,
+                                BasicBlock *ScanBB,
+                                BasicBlock::iterator &ScanFrom,
+                                unsigned MaxInstsToScan,
+                                AliasAnalysis *AA = nullptr,
+                                AAMDNodes *AATags = nullptr,
+                                bool *IsLoadCSE = nullptr);
+
 /// \brief Scan backwards to see if we have the value of the given load
 /// available locally within a small number of instructions.
 ///
Index: lib/Analysis/Loads.cpp
===================================================================
--- lib/Analysis/Loads.cpp
+++ lib/Analysis/Loads.cpp
@@ -302,25 +302,15 @@
            "to scan backward from a given instruction, when searching for "
            "available loaded value"));
 
-Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
+Value *llvm::FindAvailableLoadedValue(Value *Ptr, Type *AccessTy,
+                                      bool IsAtomicMemOp, BasicBlock *ScanBB,
                                       BasicBlock::iterator &ScanFrom,
                                       unsigned MaxInstsToScan,
                                       AliasAnalysis *AA, AAMDNodes *AATags,
                                       bool *IsLoadCSE) {
+
   if (MaxInstsToScan == 0)
     MaxInstsToScan = ~0U;
-
-  Value *Ptr = Load->getPointerOperand();
-  Type *AccessTy = Load->getType();
-
-  // We can never remove a volatile load
-  if (Load->isVolatile())
-    return nullptr;
-
-  // Anything stronger than unordered is currently unimplemented.
-  if (!Load->isUnordered())
-    return nullptr;
-
   const DataLayout &DL = ScanBB->getModule()->getDataLayout();
 
   // Try to get the store size for the type.
@@ -353,7 +343,7 @@
 
         // We can value forward from an atomic to a non-atomic, but not the
        // other way around.
-        if (LI->isAtomic() < Load->isAtomic())
+        if (LI->isAtomic() < IsAtomicMemOp)
           return nullptr;
 
         if (AATags)
@@ -374,7 +364,7 @@
 
         // We can value forward from an atomic to a non-atomic, but not the
         // other way around.
-        if (SI->isAtomic() < Load->isAtomic())
+        if (SI->isAtomic() < IsAtomicMemOp)
           return nullptr;
 
         if (AATags)
@@ -418,3 +408,24 @@
   // block.
   return nullptr;
 }
+
+
+Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
+                                      BasicBlock::iterator &ScanFrom,
+                                      unsigned MaxInstsToScan,
+                                      AliasAnalysis *AA, AAMDNodes *AATags,
+                                      bool *IsLoadCSE) {
+
+  // We can never remove a volatile load
+  if (Load->isVolatile())
+    return nullptr;
+
+  // Anything stronger than unordered is currently unimplemented.
+  if (!Load->isUnordered())
+    return nullptr;
+
+  // Return the full value of the load if available.
+  return FindAvailableLoadedValue(Load->getPointerOperand(), Load->getType(),
+                                  Load->isAtomic(), ScanBB, ScanFrom,
+                                  MaxInstsToScan, AA, AATags, IsLoadCSE);
+}
Index: lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -13,9 +13,10 @@
 
 #include "InstCombineInternal.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/PatternMatch.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
 using namespace llvm;
 using namespace PatternMatch;
 
@@ -575,6 +576,36 @@
   if (Instruction *I = foldVecTruncToExtElt(CI, *this, DL))
     return I;
 
+  // When the trunc operand is a widened load, see if we can get the value
+  // from a previous 'narrow' store/load. This involves extracting partial
+  // memory contents from the widened load, at the correct offset depending
+  // on the memory layout.
+
+  // TODO: Currently, this rule works only for little-endian layouts. To get
+  // the correct content from a previous 'narrow' store/load in big-endian
+  // mode, we would need to confirm that the load is from the correct offset
+  // (i.e. pass the offset along with DestTy to FindAvailableLoadedValue).
+
+  if (DL.isLittleEndian())
+    if (auto *LI = dyn_cast<LoadInst>(Src)) {
+      BasicBlock::iterator BBI(*LI);
+
+      // Scan a few instructions up from LI. If we find a partial load/store
+      // of type DestTy that feeds into LI, we can replace all uses of the
+      // trunc with the load/store value. This replacement can be done only
+      // for non-volatile loads, with ordering at most unordered. If the load
+      // is atomic, its only use should be the trunc instruction; we don't
+      // want other users of LI to see a value that is out of sync with the
+      // value we're folding the trunc to (in case of a race).
+      if (LI->isUnordered() && (!LI->isAtomic() || LI->hasOneUse()))
+        if (Value *AvailableVal = FindAvailableLoadedValue(
+                LI->getPointerOperand(), DestTy, LI->isAtomic(),
+                LI->getParent(), BBI, DefMaxInstsToScan))
+          return replaceInstUsesWith(
+              CI, Builder->CreateBitOrPointerCast(AvailableVal, CI.getType(),
+                                                  CI.getName() + ".cast"));
+    }
+
   return nullptr;
 }
 
Index: test/Transforms/InstCombine/trunc.ll
===================================================================
--- test/Transforms/InstCombine/trunc.ll
+++ test/Transforms/InstCombine/trunc.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+; RUN: opt < %s -default-data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -instcombine -S | FileCheck %s
+; RUN: opt < %s -default-data-layout="E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -instcombine -S | FileCheck %s --check-prefix=BE
 
 ; Instcombine should be able to eliminate all of these ext casts.
 
@@ -181,3 +181,164 @@
 bb2:
   unreachable
 }
+
+
+; The trunc can be replaced with the value available from the store;
+; the load feeding into the trunc is left as-is.
+; FIXME: This transformation is currently enabled only for Little Endian.
+; In BE mode, we cannot replace the trunc with 0, since the zero is in
+; the high half of %wide.load (which we would get by a shr of
+; %wide.load followed by a trunc).
declare void @consume(i8) readonly
+define i1 @trunc_load_store(i8* align 2 %a) {
+  store i8 0, i8 *%a, align 2
+  %bca = bitcast i8* %a to i16*
+  %wide.load = load i16, i16* %bca, align 2
+  %lowhalf.1 = trunc i16 %wide.load to i8
+  call void @consume(i8 %lowhalf.1)
+  %cmp.2 = icmp ult i16 %wide.load, 256
+  ret i1 %cmp.2
+; CHECK-LABEL: @trunc_load_store
+; CHECK: %wide.load = load i16, i16* %bca, align 2
+; CHECK-NOT: trunc
+; CHECK: call void @consume(i8 0)
+; BE-LABEL: @trunc_load_store
+; BE: trunc
+; BE: call void @consume(i8 %lowhalf.1)
+}
+
+
+; The trunc can be replaced with the load value.
+; Both loads are left as-is, since they have uses.
+; FIXME: This is currently done only for LE layouts.
+define i1 @trunc_load_load(i8* align 2 %a) {
+  %pload = load i8, i8* %a, align 2
+  %bca = bitcast i8* %a to i16*
+  %wide.load = load i16, i16* %bca, align 2
+  %lowhalf = trunc i16 %wide.load to i8
+  call void @consume(i8 %lowhalf)
+  call void @consume(i8 %pload)
+  %cmp.2 = icmp ult i16 %wide.load, 256
+  ret i1 %cmp.2
+; CHECK-LABEL: @trunc_load_load
+; CHECK-NEXT: %pload = load i8, i8* %a, align 2
+; CHECK-NEXT: %bca = bitcast i8* %a to i16*
+; CHECK-NEXT: %wide.load = load i16, i16* %bca, align 2
+; CHECK-NEXT: call void @consume(i8 %pload)
+; CHECK-NEXT: call void @consume(i8 %pload)
+; CHECK-NEXT: %cmp.2 = icmp ult i16 %wide.load, 256
+
+; BE-LABEL: @trunc_load_load
+; BE: trunc
+; BE: call void @consume(i8 %lowhalf)
+; BE: call void @consume(i8 %pload)
+}
+
+; Store and load of the same size, to the same memory location, with the address generated through GEPs.
+; The trunc can be removed by using the store value.
+; NOTE: We can make this transformation in both LE and BE, since the load and store are of the same size (i16)
+; and access the same memory location.
+define void @trunc_with_gep_memaccess(i16* align 2 %p) {
+  %t0 = getelementptr i16, i16* %p, i32 1
+  store i16 2, i16* %t0
+  %t1 = getelementptr i16, i16* %p, i32 1
+  %x = load i16, i16* %t1
+  %lowhalf = trunc i16 %x to i8
+  call void @consume(i8 %lowhalf)
+  ret void
+; CHECK-LABEL: @trunc_with_gep_memaccess
+; CHECK-NOT: trunc
+; CHECK: call void @consume(i8 2)
+
+; BE-LABEL: @trunc_with_gep_memaccess
+; BE-NOT: trunc
+; BE: call void @consume(i8 2)
+}
+
+; The trunc should not be replaced, since the atomic load %wide.load has more than one use.
+; Different values could be seen by the uses of %wide.load in case of a race.
+define i1 @trunc_atomic_loads(i8* align 2 %a) {
+  %pload = load atomic i8, i8* %a unordered, align 2
+  %bca = bitcast i8* %a to i16*
+  %wide.load = load atomic i16, i16* %bca unordered, align 2
+  %lowhalf = trunc i16 %wide.load to i8
+  call void @consume(i8 %lowhalf)
+  call void @consume(i8 %pload)
+  %cmp.2 = icmp ult i16 %wide.load, 256
+  ret i1 %cmp.2
+; CHECK-LABEL: @trunc_atomic_loads
+; CHECK: trunc
+
+; BE-LABEL: @trunc_atomic_loads
+; BE: trunc
+}
+
+; The trunc can be replaced, since the atomic load has a single use.
+; The atomic load is also removed, since its only use goes away.
+; FIXME: This is currently done only for LE layouts.
+define void @trunc_atomic_single_load(i8* align 2 %a) {
+  %pload = load atomic i8, i8* %a unordered, align 2
+  %bca = bitcast i8* %a to i16*
+  %wide.load = load atomic i16, i16* %bca unordered, align 2
+  %lowhalf = trunc i16 %wide.load to i8
+  call void @consume(i8 %lowhalf)
+  call void @consume(i8 %pload)
+  ret void
+; CHECK-LABEL: @trunc_atomic_single_load
+; CHECK-NOT: %wide.load = load atomic i16, i16* %bca unordered, align 2
+; CHECK-NOT: trunc
+
+; BE-LABEL: @trunc_atomic_single_load
+; BE: trunc
+}
+
+
+; The trunc cannot be replaced, since the load's atomic ordering is stronger than unordered.
+define void @trunc_atomic_monotonic(i8* align 2 %a) {
+  %pload = load atomic i8, i8* %a monotonic, align 2
+  %bca = bitcast i8* %a to i16*
+  %wide.load = load atomic i16, i16* %bca monotonic, align 2
+  %lowhalf = trunc i16 %wide.load to i8
+  call void @consume(i8 %lowhalf)
+  call void @consume(i8 %pload)
+  ret void
+; CHECK-LABEL: @trunc_atomic_monotonic
+; CHECK: %wide.load = load atomic i16, i16* %bca monotonic, align 2
+; CHECK: trunc
+}
+
+; The trunc cannot be replaced, since the store size (i16) is not the trunc result size (i8).
+; FIXME: We could extract the i8 content of the trunc from the i16 store value.
+define i1 @trunc_different_size_load(i16 * align 2 %a) {
+  store i16 0, i16 *%a, align 2
+  %bca = bitcast i16* %a to i32*
+  %wide.load = load i32, i32* %bca, align 2
+  %lowhalf = trunc i32 %wide.load to i8
+  call void @consume(i8 %lowhalf)
+  %cmp.2 = icmp ult i32 %wide.load, 256
+  ret i1 %cmp.2
+; CHECK-LABEL: @trunc_different_size_load
+; CHECK: %lowhalf = trunc i32 %wide.load to i8
+}
+
+declare void @consume_f(float) readonly
+; A bitcast is required, since the trunc result type and %fload have different types;
+; so the trunc is replaced with a bitcast.
+; FIXME: This is currently done only for LE layouts.
+define i1 @trunc_avoid_bitcast(float* %b) {
+  %fload = load float, float* %b
+  %bca = bitcast float* %b to i64*
+  %iload = load i64, i64* %bca
+  %low32 = trunc i64 %iload to i32
+  call void @consume_f(float %fload)
+  %cmp.2 = icmp ult i32 %low32, 256
+  ret i1 %cmp.2
+; CHECK-LABEL: @trunc_avoid_bitcast
+; CHECK-NOT: %low32 = trunc i64 %iload to i32
+; CHECK: %low32.cast = bitcast float %fload to i32
+; CHECK: %cmp.2 = icmp ult i32 %low32.cast, 256
+
+; BE-LABEL: @trunc_avoid_bitcast
+; BE: %low32 = trunc i64 %iload to i32
+; BE: %cmp.2 = icmp ult i32 %low32, 256
+}
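
Usage note (illustration only, not part of the patch): a minimal sketch of how a caller other
than InstCombine might use the new pointer-based FindAvailableLoadedValue() overload declared
above. The helper name forwardNarrowLoad is hypothetical; its volatility/ordering check mirrors
what the LoadInst-based wrapper in Loads.cpp still performs on the caller's behalf.

#include "llvm/Analysis/Loads.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: try to find a value of type NarrowTy that is already
// available at Load's address, scanning backwards from Load within its block.
static Value *forwardNarrowLoad(LoadInst *Load, Type *NarrowTy) {
  // The pointer-based overload assumes a non-volatile, at-most-unordered
  // access, so the caller has to enforce that (as visitTrunc does above).
  if (!Load->isUnordered())
    return nullptr;

  BasicBlock::iterator ScanFrom(*Load);
  return FindAvailableLoadedValue(Load->getPointerOperand(), NarrowTy,
                                  Load->isAtomic(), Load->getParent(),
                                  ScanFrom, DefMaxInstsToScan);
}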