Index: llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp =================================================================== --- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -94,6 +94,18 @@ Type *CastElTy = PTy->getElementType(); if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr; + // This optimisation does not work for cases where the cast type + // is scalable and the allocated type is not. This is because we need to + // know how many times the cast type fits into the allocated type. + // For the opposite case where the allocated type is scalable and the + // cast type is not, this leads to poor code quality due to the + // introduction of 'vscale' into the calculations. It seems better to + // bail out for this case too until we've done a proper cost-benefit + // analysis. + bool AllocIsScalable = isa(AllocElTy); + bool CastIsScalable = isa(CastElTy); + if (AllocIsScalable != CastIsScalable) return nullptr; + Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy); Align CastElTyAlign = DL.getABITypeAlign(CastElTy); if (CastElTyAlign < AllocElTyAlign) return nullptr; @@ -103,14 +115,15 @@ // same, we open the door to infinite loops of various kinds. if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr; - uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy); - uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy); + // The alloc and cast types should be either both fixed or both scalable. + uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getKnownMinSize(); + uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getKnownMinSize(); if (CastElTySize == 0 || AllocElTySize == 0) return nullptr; // If the allocation has multiple uses, only promote it if we're not // shrinking the amount of memory being allocated. 
- uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy); - uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy); + uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy).getKnownMinSize(); + uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy).getKnownMinSize(); if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr; // See if we can satisfy the modulus by pulling a scale out of the array @@ -125,6 +138,10 @@ if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 || (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return nullptr; + // We don't currently support arrays of scalable types. + if (AllocIsScalable && (ArraySizeScale != 0 || ArrayOffset != 1)) + return nullptr; + unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize; Value *Amt = nullptr; if (Scale == 1) { Index: llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
+; WARN-NOT: warning + +define void @fixed_array16i32_to_scalable4i32(* %out) { +; CHECK-LABEL: @fixed_array16i32_to_scalable4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = alloca [16 x i32], align 16 +; CHECK-NEXT: [[CAST:%.*]] = bitcast [16 x i32]* [[TMP]] to * +; CHECK-NEXT: store volatile zeroinitializer, * [[CAST]], align 16 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , * [[CAST]], align 16 +; CHECK-NEXT: store [[RELOAD]], * [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca [16 x i32], align 16 + %cast = bitcast [16 x i32]* %tmp to * + store volatile zeroinitializer, * %cast, align 16 + %reload = load volatile , * %cast, align 16 + store %reload, * %out, align 16 + ret void +} + +define void @scalable4i32_to_fixed16i32(<16 x i32>* %out) { +; CHECK-LABEL: @scalable4i32_to_fixed16i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 +; CHECK-NEXT: [[CAST:%.*]] = bitcast * [[TMP]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* [[CAST]], align 64 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile <16 x i32>, <16 x i32>* [[CAST]], align 64 +; CHECK-NEXT: store <16 x i32> [[RELOAD]], <16 x i32>* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca , align 16 + %cast = bitcast * %tmp to <16 x i32>* + store <16 x i32> zeroinitializer, <16 x i32>* %cast, align 16 + %reload = load volatile <16 x i32>, <16 x i32>* %cast, align 16 + store <16 x i32> %reload, <16 x i32>* %out, align 16 + ret void +} + +define void @fixed16i32_to_scalable4i32(* %out) { +; CHECK-LABEL: @fixed16i32_to_scalable4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = alloca <16 x i32>, align 16 +; CHECK-NEXT: [[CAST:%.*]] = bitcast <16 x i32>* [[TMP]] to * +; CHECK-NEXT: store volatile zeroinitializer, * [[CAST]], align 16 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , * [[CAST]], align 16 +; CHECK-NEXT: store [[RELOAD]], * [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca 
<16 x i32>, align 16 + %cast = bitcast <16 x i32>* %tmp to * + store volatile zeroinitializer, * %cast, align 16 + %reload = load volatile , * %cast, align 16 + store %reload, * %out, align 16 + ret void +} + +define void @scalable16i32_to_fixed16i32(<16 x i32>* %out) { +; CHECK-LABEL: @scalable16i32_to_fixed16i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 +; CHECK-NEXT: [[CAST:%.*]] = bitcast * [[TMP]] to <16 x i32>* +; CHECK-NEXT: store volatile <16 x i32> zeroinitializer, <16 x i32>* [[CAST]], align 64 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile <16 x i32>, <16 x i32>* [[CAST]], align 64 +; CHECK-NEXT: store <16 x i32> [[RELOAD]], <16 x i32>* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca , align 16 + %cast = bitcast * %tmp to <16 x i32>* + store volatile <16 x i32> zeroinitializer, <16 x i32>* %cast, align 16 + %reload = load volatile <16 x i32>, <16 x i32>* %cast, align 16 + store <16 x i32> %reload, <16 x i32>* %out, align 16 + ret void +} + +define void @scalable32i32_to_scalable16i32(* %out) { +; CHECK-LABEL: @scalable32i32_to_scalable16i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 +; CHECK-NEXT: [[CAST:%.*]] = bitcast * [[TMP]] to * +; CHECK-NEXT: store volatile zeroinitializer, * [[CAST]], align 64 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , * [[CAST]], align 64 +; CHECK-NEXT: store [[RELOAD]], * [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca , align 16 + %cast = bitcast * %tmp to * + store volatile zeroinitializer, * %cast, align 16 + %reload = load volatile , * %cast, align 16 + store %reload, * %out, align 16 + ret void +} + +define void @scalable32i16_to_scalable16i32(* %out) { +; CHECK-LABEL: @scalable32i16_to_scalable16i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 +; CHECK-NEXT: store volatile zeroinitializer, * [[TMP]], align 64 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , * [[TMP]], align 64 +; CHECK-NEXT: 
store [[RELOAD]], * [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca , align 16 + %cast = bitcast * %tmp to * + store volatile zeroinitializer, * %cast, align 16 + %reload = load volatile , * %cast, align 16 + store %reload, * %out, align 16 + ret void +} + +define void @scalable32i16_to_scalable16i32_multiuse(* %out, * %out2) { +; CHECK-LABEL: @scalable32i16_to_scalable16i32_multiuse( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 64 +; CHECK-NEXT: [[CAST:%.*]] = bitcast * [[TMP]] to * +; CHECK-NEXT: store volatile zeroinitializer, * [[CAST]], align 64 +; CHECK-NEXT: [[RELOAD:%.*]] = load volatile , * [[CAST]], align 64 +; CHECK-NEXT: store [[RELOAD]], * [[OUT:%.*]], align 16 +; CHECK-NEXT: [[RELOAD2:%.*]] = load volatile , * [[TMP]], align 64 +; CHECK-NEXT: store [[RELOAD2]], * [[OUT2:%.*]], align 16 +; CHECK-NEXT: ret void +; +entry: + %tmp = alloca , align 16 + %cast = bitcast * %tmp to * + store volatile zeroinitializer, * %cast, align 16 + %reload = load volatile , * %cast, align 16 + store %reload, * %out, align 16 + %reload2 = load volatile , * %tmp, align 16 + store %reload2, * %out2, align 16 + ret void +}