Index: llvm/lib/Transforms/Scalar/SROA.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/SROA.cpp
+++ llvm/lib/Transforms/Scalar/SROA.cpp
@@ -786,10 +786,6 @@
     if (!IsOffsetKnown)
       return PI.setAborted(&LI);
 
-    if (LI.isVolatile() &&
-        LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
-      return PI.setAborted(&LI);
-
     uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
     return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
   }
@@ -801,10 +797,6 @@
     if (!IsOffsetKnown)
      return PI.setAborted(&SI);
 
-    if (SI.isVolatile() &&
-        SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
-      return PI.setAborted(&SI);
-
     uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
 
     // If this memory access can be shown to *statically* extend outside the
@@ -839,11 +831,6 @@
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
 
-    // Don't replace this with a store with a different address space. TODO:
-    // Use a store with the casted new alloca?
-    if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
-      return PI.setAborted(&II);
-
     insertUse(II, Offset, Length ? Length->getLimitedValue()
                                  : AllocSize - Offset.getLimitedValue(),
               (bool)Length);
@@ -863,13 +850,6 @@
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
 
-    // Don't replace this with a load/store with a different address space.
-    // TODO: Use a store with the casted new alloca?
-    if (II.isVolatile() &&
-        (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
-         II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
-      return PI.setAborted(&II);
-
     // This side of the transfer is completely out-of-bounds, and so we can
     // nuke the entire transfer. However, we also need to nuke the other side
     // if already added to our partitions.
@@ -2326,6 +2306,16 @@
   // the insertion point is set to point to the user.
   IRBuilderTy IRB;
 
+  // Return the new alloca, addrspacecasted if required to avoid changing the
+  // addrspace of a volatile access.
+  Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
+    if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
+      return &NewAI;
+
+    Type *AccessTy = NewAI.getAllocatedType()->getPointerTo(AddrSpace);
+    return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
+  }
+
 public:
   AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
                       AllocaInst &OldAI, AllocaInst &NewAI,
@@ -2529,7 +2519,9 @@
         (canConvertValue(DL, NewAllocaTy, TargetTy) ||
          (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
           TargetTy->isIntegerTy()))) {
-      LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+      Value *NewPtr =
+          getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
+      LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
                                               NewAI.getAlign(), LI.isVolatile(),
                                               LI.getName());
       if (AATags)
@@ -2716,8 +2708,11 @@
       }
 
       V = convertValue(DL, IRB, V, NewAllocaTy);
+      Value *NewPtr =
+          getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
+
       NewSI =
-          IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
+          IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
     } else {
       unsigned AS = SI.getPointerAddressSpace();
       Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
@@ -2887,8 +2882,9 @@
       V = convertValue(DL, IRB, V, AllocaTy);
     }
 
+    Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
     StoreInst *New =
-        IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
+        IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
     if (AATags)
       New->setAAMetadata(AATags);
     LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
@@ -3040,14 +3036,22 @@
     }
     OtherPtrTy = OtherTy->getPointerTo(OtherAS);
 
-    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+    Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                    OtherPtr->getName() + ".");
     MaybeAlign SrcAlign = OtherAlign;
-    Value *DstPtr = &NewAI;
     MaybeAlign DstAlign = SliceAlign;
-    if (!IsDest) {
-      std::swap(SrcPtr, DstPtr);
+    if (!IsDest)
       std::swap(SrcAlign, DstAlign);
+
+    Value *SrcPtr;
+    Value *DstPtr;
+
+    if (IsDest) {
+      DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
+      SrcPtr = AdjPtr;
+    } else {
+      DstPtr = AdjPtr;
+      SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
     }
 
     Value *Src;
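Reviewer note: a minimal before/after sketch of what the new getPtrToNewAI path
produces, distilled from the addrspacecast.ll checks updated below (SSA names
are illustrative). Previously SROA aborted on any volatile access whose pointer
address space differed from the alloca's; with this change the slice is
rewritten as usual and the volatile access is re-pointed at an addrspacecast of
the new alloca:

  ; Before SROA: volatile store in addrspace(1) through a cast of the alloca.
  %A = alloca [8 x i8]
  %A.cast = addrspacecast [8 x i8]* %A to [8 x i8] addrspace(1)*
  %B = bitcast [8 x i8] addrspace(1)* %A.cast to i64 addrspace(1)*
  store volatile i64 %X, i64 addrspace(1)* %B

  ; After SROA: the slice becomes a new alloca, and the volatile store keeps
  ; its original address space via a fresh addrspacecast.
  %A.sroa.0 = alloca i64
  %0 = addrspacecast i64* %A.sroa.0 to i64 addrspace(1)*
  store volatile i64 %X, i64 addrspace(1)* %0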
Index: llvm/test/CodeGen/AMDGPU/flat-address-space.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -129,6 +129,7 @@
 define amdgpu_kernel void @flat_scratch_unaligned_load() {
   %scratch = alloca i32, addrspace(5)
   %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
+  store volatile i32* %fptr, i32* addrspace(3)* undef
   %ld = load volatile i32, i32* %fptr, align 1
   ret void
 }
@@ -141,6 +142,7 @@
 define amdgpu_kernel void @flat_scratch_unaligned_store() {
   %scratch = alloca i32, addrspace(5)
   %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
+  store volatile i32* %fptr, i32* addrspace(3)* undef
   store volatile i32 0, i32* %fptr, align 1
   ret void
 }
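Reviewer note: the `store volatile i32* %fptr, i32* addrspace(3)* undef` lines
added to these two kernels are presumably there to make %scratch escape: with
this patch SROA would otherwise promote the alloca despite the volatile flat
access, and the tests would stop exercising unaligned flat scratch loads and
stores. Storing the pointer itself keeps the alloca alive and the access flat.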
Index: llvm/test/Transforms/SROA/addrspacecast.ll
===================================================================
--- llvm/test/Transforms/SROA/addrspacecast.ll
+++ llvm/test/Transforms/SROA/addrspacecast.ll
@@ -172,12 +172,11 @@
 define i64 @alloca_addrspacecast_bitcast_volatile_store(i64 %X) {
 ; CHECK-LABEL: @alloca_addrspacecast_bitcast_volatile_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [8 x i8]
-; CHECK-NEXT:    [[A_CAST:%.*]] = addrspacecast [8 x i8]* [[A]] to [8 x i8] addrspace(1)*
-; CHECK-NEXT:    [[B:%.*]] = bitcast [8 x i8] addrspace(1)* [[A_CAST]] to i64 addrspace(1)*
-; CHECK-NEXT:    store volatile i64 [[X:%.*]], i64 addrspace(1)* [[B]]
-; CHECK-NEXT:    [[Z:%.*]] = load i64, i64 addrspace(1)* [[B]]
-; CHECK-NEXT:    ret i64 [[Z]]
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i64
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast i64* [[A_SROA_0]] to i64 addrspace(1)*
+; CHECK-NEXT:    store volatile i64 [[X:%.*]], i64 addrspace(1)* [[TMP0]]
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_Z:%.*]] = load i64, i64* [[A_SROA_0]]
+; CHECK-NEXT:    ret i64 [[A_SROA_0_0_A_SROA_0_0_Z]]
 ;
 entry:
   %A = alloca [8 x i8]
@@ -188,16 +187,44 @@
   ret i64 %Z
 }
 
+%struct = type { [256 x i8], i32 }
+
+define i65 @volatile_store_addrspacecast_slice(i65 %X, i16 %idx) {
+; CHECK-LABEL: @volatile_store_addrspacecast_slice(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca [9 x i8], align 4
+; CHECK-NEXT:    [[A_SROA_1:%.*]] = alloca [9 x i8], align 4
+; CHECK-NEXT:    [[A_SROA_1_0_GEPB_SROA_CAST:%.*]] = addrspacecast [9 x i8]* [[A_SROA_1]] to i65 addrspace(1)*
+; CHECK-NEXT:    store volatile i65 [[X:%.*]], i65 addrspace(1)* [[A_SROA_1_0_GEPB_SROA_CAST]]
+; CHECK-NEXT:    br label [[L2:%.*]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[A_SROA_0_0_GEPA_BC_SROA_CAST:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to i65*
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_20_Z:%.*]] = load i65, i65* [[A_SROA_0_0_GEPA_BC_SROA_CAST]]
+; CHECK-NEXT:    ret i65 [[A_SROA_0_0_A_SROA_0_20_Z]]
+;
+entry:
+  %A = alloca %struct
+  %B = addrspacecast %struct* %A to i65 addrspace(1)*
+  %gepA = getelementptr %struct, %struct* %A, i32 0, i32 0, i16 20
+  %gepB = getelementptr i65, i65 addrspace(1)* %B, i16 6
+  store volatile i65 %X, i65 addrspace(1)* %gepB, align 1
+  br label %L2
+
+L2:
+  %gepA.bc = bitcast i8* %gepA to i65*
+  %Z = load i65, i65* %gepA.bc, align 1
+  ret i65 %Z
+}
+
 ; Don't change the address space of a volatile operation
 define i64 @alloca_addrspacecast_bitcast_volatile_load(i64 %X) {
 ; CHECK-LABEL: @alloca_addrspacecast_bitcast_volatile_load(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [8 x i8]
-; CHECK-NEXT:    [[A_CAST:%.*]] = addrspacecast [8 x i8]* [[A]] to [8 x i8] addrspace(1)*
-; CHECK-NEXT:    [[B:%.*]] = bitcast [8 x i8] addrspace(1)* [[A_CAST]] to i64 addrspace(1)*
-; CHECK-NEXT:    store i64 [[X:%.*]], i64 addrspace(1)* [[B]]
-; CHECK-NEXT:    [[Z:%.*]] = load volatile i64, i64 addrspace(1)* [[B]]
-; CHECK-NEXT:    ret i64 [[Z]]
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i64
+; CHECK-NEXT:    store i64 [[X:%.*]], i64* [[A_SROA_0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast i64* [[A_SROA_0]] to i64 addrspace(1)*
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_Z:%.*]] = load volatile i64, i64 addrspace(1)* [[TMP0]]
+; CHECK-NEXT:    ret i64 [[A_SROA_0_0_A_SROA_0_0_Z]]
 ;
 entry:
   %A = alloca [8 x i8]
@@ -208,19 +235,45 @@
   ret i64 %Z
 }
 
+define i65 @volatile_load_addrspacecast_slice(i65 %X, i16 %idx) {
+; CHECK-LABEL: @volatile_load_addrspacecast_slice(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca [9 x i8], align 4
+; CHECK-NEXT:    [[A_SROA_1:%.*]] = alloca [9 x i8], align 4
+; CHECK-NEXT:    [[A_SROA_1_0_GEPB_SROA_CAST:%.*]] = addrspacecast [9 x i8]* [[A_SROA_1]] to i65 addrspace(1)*
+; CHECK-NEXT:    store i65 [[X:%.*]], i65 addrspace(1)* [[A_SROA_1_0_GEPB_SROA_CAST]]
+; CHECK-NEXT:    br label [[L2:%.*]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[A_SROA_0_0_GEPA_BC_SROA_CAST:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to i65*
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_20_Z:%.*]] = load volatile i65, i65* [[A_SROA_0_0_GEPA_BC_SROA_CAST]]
+; CHECK-NEXT:    ret i65 [[A_SROA_0_0_A_SROA_0_20_Z]]
+;
+entry:
+  %A = alloca %struct
+  %B = addrspacecast %struct* %A to i65 addrspace(1)*
+  %gepA = getelementptr %struct, %struct* %A, i32 0, i32 0, i16 20
+  %gepB = getelementptr i65, i65 addrspace(1)* %B, i16 6
+  store i65 %X, i65 addrspace(1)* %gepB, align 1
+  br label %L2
+
+L2:
+  %gepA.bc = bitcast i8* %gepA to i65*
+  %Z = load volatile i65, i65* %gepA.bc, align 1
+  ret i65 %Z
+}
+
+
 declare void @llvm.memset.p1i8.i32(i8 addrspace(1)* nocapture, i8, i32, i1) nounwind
 
 ; Don't change the address space of a volatile operation
 define i32 @volatile_memset() {
 ; CHECK-LABEL: @volatile_memset(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i8]
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr [4 x i8], [4 x i8]* [[A]], i32 0, i32 0
-; CHECK-NEXT:    [[ASC:%.*]] = addrspacecast i8* [[PTR]] to i8 addrspace(1)*
-; CHECK-NEXT:    call void @llvm.memset.p1i8.i32(i8 addrspace(1)* [[ASC]], i8 42, i32 4, i1 true)
-; CHECK-NEXT:    [[IPTR:%.*]] = bitcast i8* [[PTR]] to i32*
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[IPTR]]
-; CHECK-NEXT:    ret i32 [[VAL]]
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast i32* [[A_SROA_0]] to i32 addrspace(1)*
+; CHECK-NEXT:    store volatile i32 707406378, i32 addrspace(1)* [[TMP0]]
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_VAL:%.*]] = load i32, i32* [[A_SROA_0]]
+; CHECK-NEXT:    ret i32 [[A_SROA_0_0_A_SROA_0_0_VAL]]
 ;
 entry:
   %a = alloca [4 x i8]
@@ -236,11 +289,15 @@
 define void @volatile_memcpy(i8* %src, i8* %dst) {
 ; CHECK-LABEL: @volatile_memcpy(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i8]
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr [4 x i8], [4 x i8]* [[A]], i32 0, i32 0
-; CHECK-NEXT:    [[ASC:%.*]] = addrspacecast i8* [[PTR]] to i8 addrspace(1)*
-; CHECK-NEXT:    call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* [[ASC]], i8* [[SRC:%.*]], i32 4, i1 true), !tbaa !0
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p1i8.i32(i8* [[DST:%.*]], i8 addrspace(1)* [[ASC]], i32 4, i1 true), !tbaa !3
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32
+; CHECK-NEXT:    [[A_SROA_0_0_SRC_SROA_CAST:%.*]] = bitcast i8* [[SRC:%.*]] to i32*
+; CHECK-NEXT:    [[A_SROA_0_0_COPYLOAD:%.*]] = load volatile i32, i32* [[A_SROA_0_0_SRC_SROA_CAST]], align 1, !tbaa !0
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast i32* [[A_SROA_0]] to i32 addrspace(1)*
+; CHECK-NEXT:    store volatile i32 [[A_SROA_0_0_COPYLOAD]], i32 addrspace(1)* [[TMP0]], !tbaa !0
+; CHECK-NEXT:    [[A_SROA_0_0_DST_SROA_CAST:%.*]] = bitcast i8* [[DST:%.*]] to i32*
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast i32* [[A_SROA_0]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_COPYLOAD1:%.*]] = load volatile i32, i32 addrspace(1)* [[TMP1]], !tbaa !3
+; CHECK-NEXT:    store volatile i32 [[A_SROA_0_0_A_SROA_0_0_COPYLOAD1]], i32* [[A_SROA_0_0_DST_SROA_CAST]], align 1, !tbaa !3
 ; CHECK-NEXT:    ret void
 ;
 entry:
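Reviewer note on the volatile_memset checks above: the whole-alloca volatile
memset of the byte 42 (0x2A) is now folded into a single volatile i32 store of
the constant 707406378, which is 0x2A2A2A2A, i.e. 42 * 0x01010101 =
42 * 16843009 = 707406378: the four memset bytes materialized as one integer,
stored through the addrspacecast of the new alloca.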
Index: llvm/test/Transforms/SROA/basictest.ll
===================================================================
--- llvm/test/Transforms/SROA/basictest.ll
+++ llvm/test/Transforms/SROA/basictest.ll
@@ -1255,11 +1255,10 @@
 define void @PR14105_as1({ [16 x i8] } addrspace(1)* %ptr) {
 ; Make sure the right address space pointer is used for the type check.
 ; CHECK-LABEL: @PR14105_as1(
-; CHECK: alloca { [16 x i8] }, align 8
-; CHECK-NEXT: %gep = getelementptr inbounds { [16 x i8] }, { [16 x i8] } addrspace(1)* %ptr, i64 -1
-; CHECK-NEXT: %cast1 = bitcast { [16 x i8] } addrspace(1)* %gep to i8 addrspace(1)*
-; CHECK-NEXT: %cast2 = bitcast { [16 x i8] }* %a to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* align 8 %cast1, i8* align 8 %cast2, i32 16, i1 true)
+; CHECK: [[ALLOCA:%.*]] = alloca [16 x i8], align 8
+; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds { [16 x i8] }, { [16 x i8] } addrspace(1)* %ptr, i16 -1
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[ALLOCA]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* align 8 [[GEP0]], i8* align 8 [[GEP1]], i32 16, i1 true)
 entry:
   %a = alloca { [16 x i8] }, align 8