Index: llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -94,6 +94,15 @@
   Type *CastElTy = PTy->getElementType();
   if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
 
+  // This optimisation does not work for cases where the casted type
+  // is scalable and the allocated type is not. This is because we need
+  // to know how many times the casted type fits into the allocated type.
+  // For the other cases we also bail out for now until we've done a
+  // proper cost-benefit analysis with tests that pass all the alignment
+  // checks.
+  if (isa<ScalableVectorType>(AllocElTy) || isa<ScalableVectorType>(CastElTy))
+    return nullptr;
+
   Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
   Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
   if (CastElTyAlign < AllocElTyAlign) return nullptr;
@@ -103,14 +112,14 @@
   // same, we open the door to infinite loops of various kinds.
   if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
 
-  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy);
-  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy);
+  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getFixedSize();
+  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getFixedSize();
   if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
 
   // If the allocation has multiple uses, only promote it if we're not
   // shrinking the amount of memory being allocated.
-  uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy);
-  uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy);
+  uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy).getFixedSize();
+  uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy).getFixedSize();
   if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
 
   // See if we can satisfy the modulus by pulling a scale out of the array
Index: llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @fixed_array16i32_to_scalable4i32(<vscale x 4 x i32>* %out) {
+; CHECK-LABEL: @fixed_array16i32_to_scalable4i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca [16 x i32], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast [16 x i32]* [[TMP]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[RELOAD]], <vscale x 4 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca [16 x i32], align 16
+  %cast = bitcast [16 x i32]* %tmp to <vscale x 4 x i32>*
+  store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %cast, align 16
+  store <vscale x 4 x i32> %reload, <vscale x 4 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable4i32_to_fixed16i32(<16 x i32>* %out) {
+; CHECK-LABEL: @scalable4i32_to_fixed16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 4 x i32>* [[TMP]] to <16 x i32>*
+; CHECK-NEXT:    store <16 x i32> zeroinitializer, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <16 x i32>, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <16 x i32> [[RELOAD]], <16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 4 x i32>, align 16
+  %cast = bitcast <vscale x 4 x i32>* %tmp to <16 x i32>*
+  store <16 x i32> zeroinitializer, <16 x i32>* %cast, align 16
+  %reload = load volatile <16 x i32>, <16 x i32>* %cast, align 16
+  store <16 x i32> %reload, <16 x i32>* %out, align 16
+  ret void
+}
+
+define void @fixed16i32_to_scalable4i32(<vscale x 4 x i32>* %out) {
+; CHECK-LABEL: @fixed16i32_to_scalable4i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <16 x i32>, align 16
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <16 x i32>* [[TMP]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[RELOAD]], <vscale x 4 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <16 x i32>, align 16
+  %cast = bitcast <16 x i32>* %tmp to <vscale x 4 x i32>*
+  store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %cast, align 16
+  store <vscale x 4 x i32> %reload, <vscale x 4 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable16i32_to_fixed16i32(<16 x i32>* %out) {
+; CHECK-LABEL: @scalable16i32_to_fixed16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 16 x i32>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 16 x i32>* [[TMP]] to <16 x i32>*
+; CHECK-NEXT:    store volatile <16 x i32> zeroinitializer, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <16 x i32>, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <16 x i32> [[RELOAD]], <16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 16 x i32>, align 16
+  %cast = bitcast <vscale x 16 x i32>* %tmp to <16 x i32>*
+  store volatile <16 x i32> zeroinitializer, <16 x i32>* %cast, align 16
+  %reload = load volatile <16 x i32>, <16 x i32>* %cast, align 16
+  store <16 x i32> %reload, <16 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable32i32_to_scalable16i32(<vscale x 16 x i32>* %out) {
+; CHECK-LABEL: @scalable32i32_to_scalable16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 32 x i32>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 32 x i32>* [[TMP]] to <vscale x 16 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <vscale x 16 x i32> [[RELOAD]], <vscale x 16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 32 x i32>, align 16
+  %cast = bitcast <vscale x 32 x i32>* %tmp to <vscale x 16 x i32>*
+  store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* %cast, align 16
+  store <vscale x 16 x i32> %reload, <vscale x 16 x i32>* %out, align 16
+  ret void
+}
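
Note (not part of the patch itself): below is a minimal, self-contained sketch of the guard the hunk above introduces, assuming the LLVM C++ API in use at the time (isa<ScalableVectorType>, DataLayout::getTypeAllocSize returning TypeSize, and TypeSize::getFixedSize()). The helper name sizesAreComparable is hypothetical and exists only to illustrate why the scalable-vector bail-out must come before any fixed-size query: getFixedSize() asserts when the size is only known as a multiple of vscale.

// Sketch only; names and structure are illustrative, not the patched function.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Returns true when both element types have compile-time-known, non-zero
// allocation sizes, i.e. when the promotion's size arithmetic is meaningful.
static bool sizesAreComparable(Type *AllocElTy, Type *CastElTy,
                               const DataLayout &DL) {
  // A scalable vector's size is vscale * KnownMinSize, so we cannot tell at
  // compile time how many casted elements fit into the allocation; give up.
  if (isa<ScalableVectorType>(AllocElTy) || isa<ScalableVectorType>(CastElTy))
    return false;

  // Both sizes are now guaranteed to be fixed, so getFixedSize() cannot
  // assert here.
  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getFixedSize();
  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getFixedSize();
  return AllocElTySize != 0 && CastElTySize != 0;
}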