diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -121,15 +121,15 @@
   if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
     return false;
 
+  unsigned AS = Load->getPointerAddressSpace();
+
   // Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
-  int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
-                                    Load->getPointerAddressSpace());
+  int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, AS);
   APInt DemandedElts = APInt::getOneBitSet(VecNumElts, 0);
   OldCost += TTI.getScalarizationOverhead(VectorTy, DemandedElts, true, false);
 
   // New pattern: load VecPtr
-  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment,
-                                    Load->getPointerAddressSpace());
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment, AS);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -139,7 +139,7 @@
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo());
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo(AS));
   LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
   replaceValue(I, *VecLd);
   ++NumVecLoad;
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -234,6 +234,19 @@
   ret <4 x float> %r
 }
 
+; Should work with addrspace as well.
+
+define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0
+  %s = load float, float addrspace(44)* %gep, align 16
+  %r = insertelement <4 x float> undef, float %s, i64 0
+  ret <4 x float> %r
+}
+
 ; If there are enough dereferenceable bytes, we can offset the vector load.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) {
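
Note (not part of the patch): the reason getPointerTo() must receive the load's
address space is that a plain bitcast cannot change address spaces in LLVM IR,
so casting an addrspace(44) pointer to the default addrspace(0) vector pointer
type produces invalid IR. The standalone sketch below is only an illustration
of that constraint using CastInst::isBitCastable; it does not run the pass, the
file name is hypothetical, and it assumes a typed-pointer LLVM of roughly this
era (circa LLVM 11, where FixedVectorType and getPointerTo are available).

// addrspace_cast_sketch.cpp -- hypothetical standalone illustration, not code
// from the patch. Build against the LLVM C++ libraries (typed-pointer era).
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Type *FloatTy = Type::getFloatTy(Ctx);
  auto *VecTy = FixedVectorType::get(FloatTy, 4);

  // A scalar pointer in a non-default address space, like %p in the new test.
  const unsigned AS = 44;
  PointerType *ScalarPtrAS44 = FloatTy->getPointerTo(AS);

  // getPointerTo() with no argument defaults to address space 0; a bitcast
  // from the addrspace(44) pointer to that type would not be legal IR.
  assert(!CastInst::isBitCastable(ScalarPtrAS44, VecTy->getPointerTo()));

  // Passing the load's address space keeps the bitcast legal, which is what
  // VectorTy->getPointerTo(AS) does in the patched pass.
  PointerType *VecPtrAS44 = VecTy->getPointerTo(AS);
  assert(VecPtrAS44->getAddressSpace() == AS);
  assert(CastInst::isBitCastable(ScalarPtrAS44, VecPtrAS44));
  return 0;
}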