Index: llvm/lib/Transforms/Scalar/SROA.cpp =================================================================== --- llvm/lib/Transforms/Scalar/SROA.cpp +++ llvm/lib/Transforms/Scalar/SROA.cpp @@ -778,10 +778,6 @@ if (!IsOffsetKnown) return PI.setAborted(&LI); - if (LI.isVolatile() && - LI.getPointerAddressSpace() != DL.getAllocaAddrSpace()) - return PI.setAborted(&LI); - if (isa<ScalableVectorType>(LI.getType())) return PI.setAborted(&LI); @@ -796,10 +792,6 @@ if (!IsOffsetKnown) return PI.setAborted(&SI); - if (SI.isVolatile() && - SI.getPointerAddressSpace() != DL.getAllocaAddrSpace()) - return PI.setAborted(&SI); - if (isa<ScalableVectorType>(ValOp->getType())) return PI.setAborted(&SI); @@ -837,11 +829,6 @@ if (!IsOffsetKnown) return PI.setAborted(&II); - // Don't replace this with a store with a different address space. TODO: - // Use a store with the casted new alloca? - if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace()) - return PI.setAborted(&II); - insertUse(II, Offset, Length ? Length->getLimitedValue() : AllocSize - Offset.getLimitedValue(), (bool)Length); @@ -861,13 +848,6 @@ if (!IsOffsetKnown) return PI.setAborted(&II); - // Don't replace this with a load/store with a different address space. - // TODO: Use a store with the casted new alloca? - if (II.isVolatile() && - (II.getDestAddressSpace() != DL.getAllocaAddrSpace() || - II.getSourceAddressSpace() != DL.getAllocaAddrSpace())) - return PI.setAborted(&II); - // This side of the transfer is completely out-of-bounds, and so we can // nuke the entire transfer. However, we also need to nuke the other side // if already added to our partitions. @@ -2308,6 +2288,16 @@ // the insertion point is set to point to the user. IRBuilderTy IRB; + // Return the new alloca, addrspacecasted if required to avoid changing the + // addrspace of a volatile access. 
+ Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) { + if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace()) + return &NewAI; + + Type *AccessTy = NewAI.getAllocatedType()->getPointerTo(AddrSpace); + return IRB.CreateAddrSpaceCast(&NewAI, AccessTy); + } + public: AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROAPass &Pass, AllocaInst &OldAI, AllocaInst &NewAI, @@ -2508,7 +2498,9 @@ (canConvertValue(DL, NewAllocaTy, TargetTy) || (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy()))) { - LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, + Value *NewPtr = + getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile()); + LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr, NewAI.getAlign(), LI.isVolatile(), LI.getName()); if (AATags) @@ -2699,8 +2691,11 @@ } V = convertValue(DL, IRB, V, NewAllocaTy); + Value *NewPtr = + getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile()); + NewSI = - IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile()); + IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile()); } else { unsigned AS = SI.getPointerAddressSpace(); Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS)); @@ -2873,8 +2868,9 @@ V = convertValue(DL, IRB, V, AllocaTy); } + Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile()); StoreInst *New = - IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile()); + IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile()); New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); if (AATags) @@ -3027,14 +3023,22 @@ } OtherPtrTy = OtherTy->getPointerTo(OtherAS); - Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, + Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, OtherPtr->getName() + "."); MaybeAlign SrcAlign = OtherAlign; - Value *DstPtr = 
&NewAI; MaybeAlign DstAlign = SliceAlign; - if (!IsDest) { - std::swap(SrcPtr, DstPtr); + if (!IsDest) std::swap(SrcAlign, DstAlign); + + Value *SrcPtr; + Value *DstPtr; + + if (IsDest) { + DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile()); + SrcPtr = AdjPtr; + } else { + DstPtr = AdjPtr; + SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile()); } Value *Src; @@ -4686,7 +4690,8 @@ bool Changed = false; while (!DeadInsts.empty()) { Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()); - if (!I) continue; + if (!I) + continue; LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); // If the instruction is an alloca, find the possible dbg.declare connected Index: llvm/test/CodeGen/AMDGPU/flat-address-space.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -129,6 +129,7 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast i32 addrspace(5)* %scratch to i32* + store volatile i32* %fptr, i32* addrspace(3)* null %ld = load volatile i32, i32* %fptr, align 1 ret void } @@ -141,6 +142,7 @@ define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast i32 addrspace(5)* %scratch to i32* + store volatile i32* %fptr, i32* addrspace(3)* null store volatile i32 0, i32* %fptr, align 1 ret void } Index: llvm/test/Transforms/SROA/addrspacecast.ll =================================================================== --- llvm/test/Transforms/SROA/addrspacecast.ll +++ llvm/test/Transforms/SROA/addrspacecast.ll @@ -149,11 +149,11 @@ define i64 @alloca_addrspacecast_bitcast_volatile_store(i64 %X) { ; CHECK-LABEL: @alloca_addrspacecast_bitcast_volatile_store( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = alloca [8 x i8], align 1 -; CHECK-NEXT: [[A_CAST:%.*]] = addrspacecast ptr [[A]] to ptr 
addrspace(1) -; CHECK-NEXT: store volatile i64 [[X:%.*]], ptr addrspace(1) [[A_CAST]], align 4 -; CHECK-NEXT: [[Z:%.*]] = load i64, ptr addrspace(1) [[A_CAST]], align 4 -; CHECK-NEXT: ret i64 [[Z]] +; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1) +; CHECK-NEXT: store volatile i64 [[X:%.*]], ptr addrspace(1) [[TMP0]], align 8 +; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_Z:%.*]] = load i64, ptr [[A_SROA_0]], align 8 +; CHECK-NEXT: ret i64 [[A_SROA_0_0_A_SROA_0_0_Z]] ; entry: %A = alloca [8 x i8] @@ -163,15 +163,42 @@ ret i64 %Z } +%struct = type { [256 x i8], i32 } + +define i65 @volatile_store_addrspacecast_slice(i65 %X, i16 %idx) { +; CHECK-LABEL: @volatile_store_addrspacecast_slice( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [9 x i8], align 4 +; CHECK-NEXT: [[A_SROA_1:%.*]] = alloca [9 x i8], align 8 +; CHECK-NEXT: [[A_SROA_1_0_GEPB_SROA_CAST:%.*]] = addrspacecast ptr [[A_SROA_1]] to ptr addrspace(1) +; CHECK-NEXT: store volatile i65 [[X:%.*]], ptr addrspace(1) [[A_SROA_1_0_GEPB_SROA_CAST]], align 8 +; CHECK-NEXT: br label [[L2:%.*]] +; CHECK: L2: +; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_20_Z:%.*]] = load i65, ptr [[A_SROA_0]], align 4 +; CHECK-NEXT: ret i65 [[A_SROA_0_0_A_SROA_0_20_Z]] +; +entry: + %A = alloca %struct + %B = addrspacecast ptr %A to ptr addrspace(1) + %gepA = getelementptr %struct, ptr %A, i32 0, i32 0, i16 20 + %gepB = getelementptr i65, ptr addrspace(1) %B, i16 6 + store volatile i65 %X, ptr addrspace(1) %gepB, align 1 + br label %L2 + +L2: + %Z = load i65, ptr %gepA, align 1 + ret i65 %Z +} + ; Don't change the address space of a volatile operation define i64 @alloca_addrspacecast_bitcast_volatile_load(i64 %X) { ; CHECK-LABEL: @alloca_addrspacecast_bitcast_volatile_load( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = alloca [8 x i8], align 1 -; CHECK-NEXT: [[A_CAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) -; CHECK-NEXT: store i64 
[[X:%.*]], ptr addrspace(1) [[A_CAST]], align 4 -; CHECK-NEXT: [[Z:%.*]] = load volatile i64, ptr addrspace(1) [[A_CAST]], align 4 -; CHECK-NEXT: ret i64 [[Z]] +; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca i64, align 8 +; CHECK-NEXT: store i64 [[X:%.*]], ptr [[A_SROA_0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1) +; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_Z:%.*]] = load volatile i64, ptr addrspace(1) [[TMP0]], align 8 +; CHECK-NEXT: ret i64 [[A_SROA_0_0_A_SROA_0_0_Z]] ; entry: %A = alloca [8 x i8] @@ -183,15 +210,40 @@ declare void @llvm.memset.p1.i32(ptr addrspace(1) nocapture, i8, i32, i1) nounwind +define i65 @volatile_load_addrspacecast_slice(i65 %X, i16 %idx) { +; CHECK-LABEL: @volatile_load_addrspacecast_slice( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [9 x i8], align 4 +; CHECK-NEXT: [[A_SROA_1:%.*]] = alloca [9 x i8], align 8 +; CHECK-NEXT: [[A_SROA_1_0_GEPB_SROA_CAST:%.*]] = addrspacecast ptr [[A_SROA_1]] to ptr addrspace(1) +; CHECK-NEXT: store i65 [[X:%.*]], ptr addrspace(1) [[A_SROA_1_0_GEPB_SROA_CAST]], align 8 +; CHECK-NEXT: br label [[L2:%.*]] +; CHECK: L2: +; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_20_Z:%.*]] = load volatile i65, ptr [[A_SROA_0]], align 4 +; CHECK-NEXT: ret i65 [[A_SROA_0_0_A_SROA_0_20_Z]] +; +entry: + %A = alloca %struct + %B = addrspacecast ptr %A to ptr addrspace(1) + %gepA = getelementptr %struct, ptr %A, i32 0, i32 0, i16 20 + %gepB = getelementptr i65, ptr addrspace(1) %B, i16 6 + store i65 %X, ptr addrspace(1) %gepB, align 1 + br label %L2 + +L2: + %Z = load volatile i65, ptr %gepA, align 1 + ret i65 %Z +} + ; Don't change the address space of a volatile operation define i32 @volatile_memset() { ; CHECK-LABEL: @volatile_memset( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = alloca [4 x i8], align 1 -; CHECK-NEXT: [[ASC:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) -; CHECK-NEXT: call void @llvm.memset.p1.i32(ptr addrspace(1) [[ASC]], i8 42, i32 4, i1 true) -; 
CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: ret i32 [[VAL]] +; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1) +; CHECK-NEXT: store volatile i32 707406378, ptr addrspace(1) [[TMP0]], align 4 +; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_VAL:%.*]] = load i32, ptr [[A_SROA_0]], align 4 +; CHECK-NEXT: ret i32 [[A_SROA_0_0_A_SROA_0_0_VAL]] ; entry: %a = alloca [4 x i8] @@ -205,10 +257,13 @@ define void @volatile_memcpy(ptr %src, ptr %dst) { ; CHECK-LABEL: @volatile_memcpy( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = alloca [4 x i8], align 1 -; CHECK-NEXT: [[ASC:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) -; CHECK-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) [[ASC]], ptr [[SRC:%.*]], i32 4, i1 true), !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr [[DST:%.*]], ptr addrspace(1) [[ASC]], i32 4, i1 true), !tbaa [[TBAA3:![0-9]+]] +; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = load volatile i32, ptr [[SRC:%.*]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1) +; CHECK-NEXT: store volatile i32 [[A_SROA_0_0_COPYLOAD]], ptr addrspace(1) [[TMP0]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1) +; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_COPYLOAD1:%.*]] = load volatile i32, ptr addrspace(1) [[TMP1]], align 4, !tbaa [[TBAA3:![0-9]+]] +; CHECK-NEXT: store volatile i32 [[A_SROA_0_0_A_SROA_0_0_COPYLOAD1]], ptr [[DST:%.*]], align 1, !tbaa [[TBAA3]] ; CHECK-NEXT: ret void ; entry: Index: llvm/test/Transforms/SROA/basictest.ll =================================================================== --- llvm/test/Transforms/SROA/basictest.ll +++ llvm/test/Transforms/SROA/basictest.ll @@ -1210,9 +1210,9 @@ ; Make sure this the right address space pointer is used for type 
check. ; CHECK-LABEL: @PR14105_as1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = alloca { [16 x i8] }, align 8 +; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [16 x i8], align 8 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds { [16 x i8] }, ptr addrspace(1) [[PTR:%.*]], i64 -1 -; CHECK-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 8 [[GEP]], ptr align 8 [[A]], i32 16, i1 true) +; CHECK-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 8 [[GEP]], ptr align 8 [[A_SROA_0]], i32 16, i1 true) ; CHECK-NEXT: ret void ; @@ -1774,9 +1774,9 @@ ; CHECK-LABEL: @PR25873( ; CHECK-NEXT: entry: ; CHECK-NEXT: store i32 1123418112, ptr [[OUTDATA:%.*]], align 4 -; CHECK-NEXT: [[DOTSROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OUTDATA:%.*]], i64 4 -; CHECK-NEXT: store i32 1139015680, ptr [[DOTSROA_IDX]], align 4 -; CHECK-NEXT: [[TMPDATA_SROA_6_0__SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OUTDATA:%.*]], i64 8 +; CHECK-NEXT: [[OUTDATA_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OUTDATA]], i64 4 +; CHECK-NEXT: store i32 1139015680, ptr [[OUTDATA_SROA_IDX]], align 4 +; CHECK-NEXT: [[TMPDATA_SROA_6_0_OUTDATA_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OUTDATA]], i64 8 ; CHECK-NEXT: [[TMPDATA_SROA_6_SROA_4_0_INSERT_EXT:%.*]] = zext i32 1139015680 to i64 ; CHECK-NEXT: [[TMPDATA_SROA_6_SROA_4_0_INSERT_SHIFT:%.*]] = shl i64 [[TMPDATA_SROA_6_SROA_4_0_INSERT_EXT]], 32 ; CHECK-NEXT: [[TMPDATA_SROA_6_SROA_4_0_INSERT_MASK:%.*]] = and i64 undef, 4294967295 @@ -1784,7 +1784,7 @@ ; CHECK-NEXT: [[TMPDATA_SROA_6_SROA_0_0_INSERT_EXT:%.*]] = zext i32 1123418112 to i64 ; CHECK-NEXT: [[TMPDATA_SROA_6_SROA_0_0_INSERT_MASK:%.*]] = and i64 [[TMPDATA_SROA_6_SROA_4_0_INSERT_INSERT]], -4294967296 ; CHECK-NEXT: [[TMPDATA_SROA_6_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[TMPDATA_SROA_6_SROA_0_0_INSERT_MASK]], [[TMPDATA_SROA_6_SROA_0_0_INSERT_EXT]] -; CHECK-NEXT: store i64 [[TMPDATA_SROA_6_SROA_0_0_INSERT_INSERT]], ptr [[TMPDATA_SROA_6_0__SROA_IDX]], align 4 +; 
CHECK-NEXT: store i64 [[TMPDATA_SROA_6_SROA_0_0_INSERT_INSERT]], ptr [[TMPDATA_SROA_6_0_OUTDATA_SROA_IDX]], align 4 ; CHECK-NEXT: ret void ; entry: