Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -118,8 +118,6 @@ bool run(); private: - GetElementPtrInst *getSourceGEP(Value *Src) const; - unsigned getPointerAddressSpace(Value *I); unsigned getAlignment(LoadInst *LI) const { @@ -139,6 +137,8 @@ } bool isConsecutiveAccess(Value *A, Value *B); + bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size); + bool tryHarder(Value *PtrA, Value *PtrB, APInt PtrDelta); /// After vectorization, reorder the instructions that I depends on /// (the instructions defining its operands), to ensure they dominate I. @@ -277,21 +277,6 @@ return -1; } -GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const { - // First strip pointer bitcasts. Make sure pointee size is the same with - // and without casts. - // TODO: a stride set by the add instruction below can match the difference - // in pointee type size here. Currently it will not be vectorized. - Value *SrcPtr = getLoadStorePointerOperand(Src); - Value *SrcBase = SrcPtr->stripPointerCasts(); - Type *SrcPtrType = SrcPtr->getType()->getPointerElementType(); - Type *SrcBaseType = SrcBase->getType()->getPointerElementType(); - if (SrcPtrType->isSized() && SrcBaseType->isSized() && - DL.getTypeStoreSize(SrcPtrType) == DL.getTypeStoreSize(SrcBaseType)) - SrcPtr = SrcBase; - return dyn_cast(SrcPtr); -} - // FIXME: Merge with llvm::isConsecutiveAccess bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { Value *PtrA = getLoadStorePointerOperand(A); @@ -304,7 +289,6 @@ return false; // Make sure that A and B are different pointers of the same size type. - unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); Type *PtrATy = PtrA->getType()->getPointerElementType(); Type *PtrBTy = PtrB->getType()->getPointerElementType(); if (PtrA == PtrB || @@ -314,10 +298,16 @@ DL.getTypeStoreSize(PtrBTy->getScalarType())) return false; + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); - unsigned IdxWidth = DL.getIndexSizeInBits(ASA); - APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); + return areConsecutivePointers(PtrA, PtrB, Size); +} + +bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) { + unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); + APInt OffsetA(PtrBitWidth, 0); + APInt OffsetB(PtrBitWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); @@ -343,68 +333,95 @@ // Sometimes even this doesn't work, because SCEV can't always see through // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking // things the hard way. + return tryHarder(PtrA, PtrB, BaseDelta); +} + +bool Vectorizer::tryHarder(Value *PtrA, Value *PtrB, APInt PtrDelta) { + auto *GEPA = dyn_cast(PtrA); + auto *GEPB = dyn_cast(PtrB); + if (!GEPA || !GEPB) + return false; // Look through GEPs after checking they're the same except for the last // index. - GetElementPtrInst *GEPA = getSourceGEP(A); - GetElementPtrInst *GEPB = getSourceGEP(B); - if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands()) + if (GEPA->getNumOperands() != GEPB->getNumOperands() || + GEPA->getPointerOperand() != GEPB->getPointerOperand()) return false; - unsigned FinalIndex = GEPA->getNumOperands() - 1; - for (unsigned i = 0; i < FinalIndex; i++) - if (GEPA->getOperand(i) != GEPB->getOperand(i)) + gep_type_iterator GTIA = gep_type_begin(GEPA); + gep_type_iterator GTIB = gep_type_begin(GEPB); + for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { + if (GTIA.getOperand() != GTIB.getOperand()) return false; + ++GTIA; + ++GTIB; + } - Instruction *OpA = dyn_cast(GEPA->getOperand(FinalIndex)); - Instruction *OpB = dyn_cast(GEPB->getOperand(FinalIndex)); + Instruction *OpA = dyn_cast(GTIA.getOperand()); + Instruction *OpB = dyn_cast(GTIB.getOperand()); if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || OpA->getType() != OpB->getType()) return false; + if (PtrDelta.isNegative()) { + if (PtrDelta.isMinSignedValue()) + return false; + PtrDelta.negate(); + std::swap(OpA, OpB); + } + uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); + if (PtrDelta.urem(Stride) != 0) + return false; + unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); + APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth); + // Only look through a ZExt/SExt. if (!isa(OpA) && !isa(OpA)) return false; bool Signed = isa(OpA); - OpA = dyn_cast(OpA->getOperand(0)); + // At this point A could be a function parameter, i.e. not an instruction + Value *ValA = OpA->getOperand(0); OpB = dyn_cast(OpB->getOperand(0)); - if (!OpA || !OpB || OpA->getType() != OpB->getType()) + if (!OpB || ValA->getType() != OpB->getType()) return false; - // Now we need to prove that adding 1 to OpA won't overflow. + // Now we need to prove that adding IdxDiff to ValA won't overflow. bool Safe = false; - // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA, - // we're okay. + // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to + // ValA, we're okay. if (OpB->getOpcode() == Instruction::Add && - isa(OpB->getOperand(1)) && - cast(OpB->getOperand(1))->getSExtValue() > 0) { + ((isa(OpB->getOperand(0)) && + IdxDiff.sle(cast(OpB->getOperand(0))->getSExtValue())) || + (isa(OpB->getOperand(1)) && + IdxDiff.sle(cast(OpB->getOperand(1))->getSExtValue())))) { if (Signed) Safe = cast(OpB)->hasNoSignedWrap(); else Safe = cast(OpB)->hasNoUnsignedWrap(); } - unsigned BitWidth = OpA->getType()->getScalarSizeInBits(); + unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); // Second attempt: - // If any bits are known to be zero other than the sign bit in OpA, we can - // add 1 to it while guaranteeing no overflow of any sort. + // If all set bits of IdxDiff or any higher order bit other than the sign bit + // are known to be zero in ValA, we can add Diff to it while guaranteeing no + // overflow of any sort. if (!Safe) { + OpA = dyn_cast(ValA); + if (!OpA) + return false; KnownBits Known(BitWidth); computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); - if (Known.countMaxTrailingOnes() < (BitWidth - 1)) - Safe = true; + if (Known.Zero.trunc(BitWidth - 1).zext(IdxBitWidth).ult(IdxDiff)) + return false; } - if (!Safe) - return false; - - const SCEV *OffsetSCEVA = SE.getSCEV(OpA); + const SCEV *OffsetSCEVA = SE.getSCEV(ValA); const SCEV *OffsetSCEVB = SE.getSCEV(OpB); - const SCEV *One = SE.getConstant(APInt(BitWidth, 1)); - const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One); - return X2 == OffsetSCEVB; + const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); + const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); + return X == OffsetSCEVB; } void Vectorizer::reorder(Instruction *I) { Index: test/CodeGen/X86/loadStore_vectorizer.ll =================================================================== --- test/CodeGen/X86/loadStore_vectorizer.ll +++ test/CodeGen/X86/loadStore_vectorizer.ll @@ -1,8 +1,9 @@ -; RUN: opt -load-store-vectorizer < %s -S | FileCheck %s +; RUN: opt -mtriple x86_64-- -load-store-vectorizer < %s -S | FileCheck %s %struct_render_pipeline_state = type opaque -define fastcc void @main(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr { +define fastcc void @test1(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr { +; CHECK-LABEL: @test1 ; CHECK: load i16 ; CHECK: load i16 entry: @@ -14,3 +15,16 @@ %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2 ret void } + +define fastcc void @test2(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr { +; CHECK-LABEL: @test2 +; CHECK: load <2 x i16> +entry: + %tmp = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i16 addrspace(1)* + %tmp1 = load i16, i16 addrspace(1)* %tmp, align 2 + %tmp2 = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i8 addrspace(1)* + %sunkaddr51 = getelementptr i8, i8 addrspace(1)* %tmp2, i64 2 + %tmp3 = bitcast i8 addrspace(1)* %sunkaddr51 to i16 addrspace(1)* + %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2 + ret void +} Index: test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll @@ -56,10 +56,10 @@ ret void } -; TODO: This can be vectorized, but currently vectorizer unable to do it. ; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx +; CHECK: load <4 x i32> define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) { - %add1 = add nuw i32 %base, 0 + %add1 = add i32 %base, 0 %zext1 = zext i32 %add1 to i64 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1 %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* @@ -74,10 +74,27 @@ %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3 %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)* %load3 = load i32, i32 addrspace(1)* %f2i3, align 4 - %add4 = add nuw i32 %base, 16 + %add4 = add nuw i32 %base, 12 %zext4 = zext i32 %add4 to i64 %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4 %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)* %load4 = load i32, i32 addrspace(1)* %f2i4, align 4 ret void } + +; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta +; CHECK: load <2 x i32> +define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) { + %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)* + %a.offset = add nuw i32 4, %base + %t.offset.zexted = zext i32 %base to i64 + %a.offset.zexted = zext i32 %a.offset to i64 + %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted + %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted + %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6 + %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)* + %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)* + %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted + %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted + ret void +}