Index: llvm/lib/Transforms/Scalar/SROA.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/SROA.cpp
+++ llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2474,6 +2474,9 @@
     Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                      NewAI.getAlign(), "load");
+    cast<LoadInst>(V)->copyMetadata(
+        NewAI, {LLVMContext::MD_mem_parallel_loop_access,
+                LLVMContext::MD_access_group});
     return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
   }

@@ -2482,6 +2485,9 @@
     assert(!LI.isVolatile());
     Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                      NewAI.getAlign(), "load");
+    cast<LoadInst>(V)->copyMetadata(
+        NewAI, {LLVMContext::MD_mem_parallel_loop_access,
+                LLVMContext::MD_access_group});
     V = convertValue(DL, IRB, V, IntTy);
     assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
@@ -2529,6 +2535,8 @@
       LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                               NewAI.getAlign(), LI.isVolatile(),
                                               LI.getName());
+      NewLI->copyMetadata(NewAI, {LLVMContext::MD_mem_parallel_loop_access,
+                                  LLVMContext::MD_access_group});
       if (AATags)
         NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
       if (LI.isVolatile())
@@ -2572,6 +2580,8 @@
         NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
       if (LI.isVolatile())
         NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+      NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
+                               LLVMContext::MD_access_group});

       V = NewLI;
       IsPtrAdjusted = true;
@@ -2631,6 +2641,8 @@
       V = insertVector(IRB, Old, V, BeginIndex, "vec");
     }
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
+    Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
+                             LLVMContext::MD_access_group});
     if (AATags)
       Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
     Pass.DeadInsts.push_back(&SI);
@@ -2646,6 +2658,9 @@
             IntTy->getBitWidth()) {
       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                          NewAI.getAlign(), "oldload");
+      cast<LoadInst>(Old)->copyMetadata(
+          NewAI, {LLVMContext::MD_mem_parallel_loop_access,
+                  LLVMContext::MD_access_group});
       Old = convertValue(DL, IRB, Old, IntTy);
       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
@@ -2890,6 +2905,8 @@
     StoreInst *New =
         IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
+    New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
+                           LLVMContext::MD_access_group});
     if (AATags)
       New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
     LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
@@ -3065,6 +3082,8 @@
     } else {
       LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
                                              II.isVolatile(), "copyload");
+      Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
+                              LLVMContext::MD_access_group});
       if (AATags)
         Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
       Src = Load;
@@ -3085,6 +3104,8 @@
     StoreInst *Store = cast<StoreInst>(
         IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
+    Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
+                             LLVMContext::MD_access_group});
     if (AATags)
       Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
@@ -4075,7 +4096,7 @@
                              PartPtrTy, StoreBasePtr->getName() + "."),
           getAdjustedAlignment(SI, PartOffset),
           /*IsVolatile*/ false);
-      PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
+      PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
                                  LLVMContext::MD_access_group});
       LLVM_DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
     }
@@ -4160,6 +4181,8 @@
               LoadPartPtrTy, LoadBasePtr->getName() + "."),
           getAdjustedAlignment(LI, PartOffset),
           /*IsVolatile*/ false, LI->getName());
+      PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
+                                LLVMContext::MD_access_group});
     }

     // And store this partition.
@@ -4172,6 +4195,8 @@
               StorePartPtrTy, StoreBasePtr->getName() + "."),
           getAdjustedAlignment(SI, PartOffset),
           /*IsVolatile*/ false);
+      PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
+                                 LLVMContext::MD_access_group});

       // Now build a new slice for the alloca.
       NewSlices.push_back(
Index: llvm/test/Transforms/SROA/mem-par-metadata-sroa-cast.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SROA/mem-par-metadata-sroa-cast.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+;
+; Make sure the llvm.access.group metadata is preserved
+; when a load/store is replaced with another load/store by SROA.
+; Ensure this is done for casting too.
+;
+; CHECK: entry:
+; CHECK: load i32, i32* {{.*}}, !llvm.access.group [[DISTINCT:![0-9]*]]
+; CHECK: load i32, i32* {{.*}}, !llvm.access.group [[DISTINCT]]
+; CHECK: ret void
+; CHECK: [[DISTINCT]] = distinct !{}
+
+%CMPLX = type { float, float }
+
+define dso_local void @test() {
+entry:
+  %PART = alloca %CMPLX, align 8
+  %PREV = alloca %CMPLX, align 8
+  %r2 = getelementptr %CMPLX, %CMPLX* %PREV, i32 0, i32 0
+  store float 0.000000e+00, float* %r2, align 4
+  %i2 = getelementptr %CMPLX, %CMPLX* %PREV, i32 0, i32 1
+  store float 0.000000e+00, float* %i2, align 4
+  %dummy = sext i16 0 to i64
+  %T = getelementptr %CMPLX, %CMPLX* %PART, i64 %dummy
+  %X35 = bitcast %CMPLX* %T to i64*
+  %X36 = bitcast %CMPLX* %PREV to i64*
+  %X37 = load i64, i64* %X35, align 8, !llvm.access.group !0
+  store i64 %X37, i64* %X36, align 8
+  ret void
+}
+
+!0 = distinct !{}