Index: llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -94,6 +94,15 @@
   Type *CastElTy = PTy->getElementType();
   if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
 
+  // This optimisation does not work for cases where the casted type
+  // is scalable and the allocated type is not. This is because we need
+  // to know how many times the casted type fits into the allocated type.
+  // For the other cases we also bail out for now until we've done a
+  // proper cost-benefit analysis with tests that pass all the alignment
+  // checks.
+  if (isa<ScalableVectorType>(AllocElTy) || isa<ScalableVectorType>(CastElTy))
+    return nullptr;
+
   Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
   Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
   if (CastElTyAlign < AllocElTyAlign) return nullptr;
@@ -103,14 +112,14 @@
   // same, we open the door to infinite loops of various kinds.
   if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
 
-  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy);
-  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy);
+  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getFixedSize();
+  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getFixedSize();
   if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
 
   // If the allocation has multiple uses, only promote it if we're not
   // shrinking the amount of memory being allocated.
-  uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy);
-  uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy);
+  uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy).getFixedSize();
+  uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy).getFixedSize();
   if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
 
   // See if we can satisfy the modulus by pulling a scale out of the array
Index: llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @fixed_array16i32_to_scalable4i32(<vscale x 4 x i32>* %out) {
+; CHECK-LABEL: @fixed_array16i32_to_scalable4i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca [16 x i32], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast [16 x i32]* [[TMP]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[RELOAD]], <vscale x 4 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca [16 x i32], align 16
+  %cast = bitcast [16 x i32]* %tmp to <vscale x 4 x i32>*
+  store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %cast, align 16
+  store <vscale x 4 x i32> %reload, <vscale x 4 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable4i32_to_fixed16i32(<16 x i32>* %out) {
+; CHECK-LABEL: @scalable4i32_to_fixed16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 4 x i32>* [[TMP]] to <16 x i32>*
+; CHECK-NEXT:    store <16 x i32> zeroinitializer, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <16 x i32>, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <16 x i32> [[RELOAD]], <16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 4 x i32>, align 16
+  %cast = bitcast <vscale x 4 x i32>* %tmp to <16 x i32>*
+  store <16 x i32> zeroinitializer, <16 x i32>* %cast, align 16
+  %reload = load volatile <16 x i32>, <16 x i32>* %cast, align 16
+  store <16 x i32> %reload, <16 x i32>* %out, align 16
+  ret void
+}
+
+define void @fixed16i32_to_scalable4i32(<vscale x 4 x i32>* %out) {
+; CHECK-LABEL: @fixed16i32_to_scalable4i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <16 x i32>, align 16
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <16 x i32>* [[TMP]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[RELOAD]], <vscale x 4 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <16 x i32>, align 16
+  %cast = bitcast <16 x i32>* %tmp to <vscale x 4 x i32>*
+  store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %cast, align 16
+  store <vscale x 4 x i32> %reload, <vscale x 4 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable16i32_to_fixed16i32(<16 x i32>* %out) {
+; CHECK-LABEL: @scalable16i32_to_fixed16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 16 x i32>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 16 x i32>* [[TMP]] to <16 x i32>*
+; CHECK-NEXT:    store volatile <16 x i32> zeroinitializer, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <16 x i32>, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <16 x i32> [[RELOAD]], <16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 16 x i32>, align 16
+  %cast = bitcast <vscale x 16 x i32>* %tmp to <16 x i32>*
+  store volatile <16 x i32> zeroinitializer, <16 x i32>* %cast, align 16
+  %reload = load volatile <16 x i32>, <16 x i32>* %cast, align 16
+  store <16 x i32> %reload, <16 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable32i32_to_scalable16i32(<vscale x 16 x i32>* %out) {
+; CHECK-LABEL: @scalable32i32_to_scalable16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 32 x i32>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 32 x i32>* [[TMP]] to <vscale x 16 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <vscale x 16 x i32> [[RELOAD]], <vscale x 16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 32 x i32>, align 16
+  %cast = bitcast <vscale x 32 x i32>* %tmp to <vscale x 16 x i32>*
+  store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* %cast, align 16
+  store <vscale x 16 x i32> %reload, <vscale x 16 x i32>* %out, align 16
+  ret void
+}
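
Note (not part of the patch itself): below is a minimal, self-contained sketch of the guard the hunk above introduces, assuming the LLVM C++ API in use at the time (isa<ScalableVectorType>, DataLayout::getTypeAllocSize returning TypeSize, and TypeSize::getFixedSize()). The helper name sizesAreComparable is hypothetical and exists only to illustrate why the scalable-vector bail-out must come before any fixed-size query: getFixedSize() asserts when the size is only known as a multiple of vscale.

// Sketch only; names and structure are illustrative, not the patched function.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Returns true when both element types have compile-time-known, non-zero
// allocation sizes, i.e. when the promotion's size arithmetic is meaningful.
static bool sizesAreComparable(Type *AllocElTy, Type *CastElTy,
                               const DataLayout &DL) {
  // A scalable vector's size is vscale * KnownMinSize, so we cannot tell at
  // compile time how many casted elements fit into the allocation; give up.
  if (isa<ScalableVectorType>(AllocElTy) || isa<ScalableVectorType>(CastElTy))
    return false;

  // Both sizes are now guaranteed to be fixed, so getFixedSize() cannot
  // assert here.
  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getFixedSize();
  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getFixedSize();
  return AllocElTySize != 0 && CastElTySize != 0;
}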