Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3211,13 +3211,19 @@ Kind = improveShuffleKindFromMask(Kind, Mask); - // Check for broadcast loads. - if (Kind == TTI::SK_Broadcast) { + // Check for broadcast loads, which are supported by the LD1R instruction. + // In terms of code-size, the shuffle vector is free when a load + dup get + // folded into a LD1R. That's what we check and return here. For performance + // and reciprocal throughput, a LD1R is not completely free. In this case, we + // return the cost for the broadcast below (i.e. 1 for most/all types), so + // that we model the load + dup sequence slightly higher because LD1R is a + // high latency instruction. + if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { bool IsLoad = !Args.empty() && isa(Args[0]); if (IsLoad && LT.second.isVector() && isLegalBroadcastLoad(Tp->getElementType(), LT.second.getVectorElementCount())) - return 0; // broadcast is handled by ld1r + return 0; } // If we have 4 elements for the shuffle and a Mask, get the cost straight Index: llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll +++ llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" 2>&1 -disable-output | FileCheck %s +; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=CODESIZE ; These tests check the costs of ld1r instructions, through the ; isLegalBroadcastLoad method. @@ -15,46 +16,93 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv4i8 = load <4 x i8>, ptr undef, align 4 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv4i8 = shufflevector <4 x i8> %lv4i8, <4 x i8> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv8i8 = load <8 x i8>, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8i8 = shufflevector <8 x i8> %lv8i8, <8 x i8> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv8i8 = shufflevector <8 x i8> %lv8i8, <8 x i8> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv16i8 = load <16 x i8>, ptr undef, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv16i8 = shufflevector <16 x i8> %lv16i8, <16 x i8> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv16i8 = shufflevector <16 x i8> %lv16i8, <16 x i8> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %lv2i16 = load <2 x i16>, ptr undef, align 4 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv2i16 = shufflevector <2 x i16> %lv2i16, <2 x i16> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv4i16 = load <4 x i16>, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4i16 = shufflevector <4 x i16> %lv4i16, <4 x i16> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv4i16 = shufflevector <4 x i16> %lv4i16, <4 x i16> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv8i16 = load <8 x i16>, ptr undef, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8i16 = shufflevector <8 x i16> %lv8i16, <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv8i16 = shufflevector <8 x i16> %lv8i16, <8 x i16> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv16i16 = load <16 x i16>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv16i16 = shufflevector <16 x i16> %lv16i16, <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sv16i16 = shufflevector <16 x i16> %lv16i16, <16 x i16> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2i32 = load <2 x i32>, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2i32 = shufflevector <2 x i32> %lv2i32, <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv2i32 = shufflevector <2 x i32> %lv2i32, <2 x i32> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv4i32 = load <4 x i32>, ptr undef, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4i32 = shufflevector <4 x i32> %lv4i32, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv4i32 = shufflevector <4 x i32> %lv4i32, <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv8i32 = load <8 x i32>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8i32 = shufflevector <8 x i32> %lv8i32, <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sv8i32 = shufflevector <8 x i32> %lv8i32, <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2i64 = load <2 x i64>, ptr undef, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2i64 = shufflevector <2 x i64> %lv2i64, <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv2i64 = shufflevector <2 x i64> %lv2i64, <2 x i64> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv4i64 = load <4 x i64>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4i64 = shufflevector <4 x i64> %lv4i64, <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sv4i64 = shufflevector <4 x i64> %lv4i64, <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2f16 = load <2 x half>, ptr undef, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2f16 = shufflevector <2 x half> %lv2f16, <2 x half> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sv2f16 = shufflevector <2 x half> %lv2f16, <2 x half> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv4f16 = load <4 x half>, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4f16 = shufflevector <4 x half> %lv4f16, <4 x half> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv4f16 = shufflevector <4 x half> %lv4f16, <4 x half> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv8f16 = load <8 x half>, ptr undef, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8f16 = shufflevector <8 x half> %lv8f16, <8 x half> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %sv8f16 = shufflevector <8 x half> %lv8f16, <8 x half> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv16f16 = load <16 x half>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv16f16 = shufflevector <16 x half> %lv16f16, <16 x half> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %sv16f16 = shufflevector <16 x half> %lv16f16, <16 x half> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2f32 = load <2 x float>, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2f32 = shufflevector <2 x float> %lv2f32, <2 x float> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv2f32 = shufflevector <2 x float> %lv2f32, <2 x float> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv4f32 = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4f32 = shufflevector <4 x float> %lv4f32, <4 x float> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv4f32 = shufflevector <4 x float> %lv4f32, <4 x float> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv8f32 = load <8 x float>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8f32 = shufflevector <8 x float> %lv8f32, <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sv8f32 = shufflevector <8 x float> %lv8f32, <8 x float> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2f64 = load <2 x double>, ptr undef, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2f64 = shufflevector <2 x double> %lv2f64, <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv2f64 = shufflevector <2 x double> %lv2f64, <2 x double> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv4f64 = load <4 x double>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4f64 = shufflevector <4 x double> %lv4f64, <4 x double> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sv4f64 = shufflevector <4 x double> %lv4f64, <4 x double> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CODESIZE-LABEL: 'shuffle' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2i8 = load <2 x i8>, ptr undef, align 2 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv2i8 = shufflevector <2 x i8> %lv2i8, <2 x i8> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv4i8 = load <4 x i8>, ptr undef, align 4 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv4i8 = shufflevector <4 x i8> %lv4i8, <4 x i8> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv8i8 = load <8 x i8>, ptr undef, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8i8 = shufflevector <8 x i8> %lv8i8, <8 x i8> undef, <8 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv16i8 = load <16 x i8>, ptr undef, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv16i8 = shufflevector <16 x i8> %lv16i8, <16 x i8> undef, <16 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2i16 = load <2 x i16>, ptr undef, align 4 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sv2i16 = shufflevector <2 x i16> %lv2i16, <2 x i16> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv4i16 = load <4 x i16>, ptr undef, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4i16 = shufflevector <4 x i16> %lv4i16, <4 x i16> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv8i16 = load <8 x i16>, ptr undef, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8i16 = shufflevector <8 x i16> %lv8i16, <8 x i16> undef, <8 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv16i16 = load <16 x i16>, ptr undef, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv16i16 = shufflevector <16 x i16> %lv16i16, <16 x i16> undef, <16 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2i32 = load <2 x i32>, ptr undef, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2i32 = shufflevector <2 x i32> %lv2i32, <2 x i32> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv4i32 = load <4 x i32>, ptr undef, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4i32 = shufflevector <4 x i32> %lv4i32, <4 x i32> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv8i32 = load <8 x i32>, ptr undef, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8i32 = shufflevector <8 x i32> %lv8i32, <8 x i32> undef, <8 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2i64 = load <2 x i64>, ptr undef, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2i64 = shufflevector <2 x i64> %lv2i64, <2 x i64> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv4i64 = load <4 x i64>, ptr undef, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4i64 = shufflevector <4 x i64> %lv4i64, <4 x i64> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2f16 = load <2 x half>, ptr undef, align 4 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2f16 = shufflevector <2 x half> %lv2f16, <2 x half> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv4f16 = load <4 x half>, ptr undef, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4f16 = shufflevector <4 x half> %lv4f16, <4 x half> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv8f16 = load <8 x half>, ptr undef, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8f16 = shufflevector <8 x half> %lv8f16, <8 x half> undef, <8 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv16f16 = load <16 x half>, ptr undef, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv16f16 = shufflevector <16 x half> %lv16f16, <16 x half> undef, <16 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2f32 = load <2 x float>, ptr undef, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2f32 = shufflevector <2 x float> %lv2f32, <2 x float> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv4f32 = load <4 x float>, ptr undef, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4f32 = shufflevector <4 x float> %lv4f32, <4 x float> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv8f32 = load <8 x float>, ptr undef, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv8f32 = shufflevector <8 x float> %lv8f32, <8 x float> undef, <8 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lv2f64 = load <2 x double>, ptr undef, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv2f64 = shufflevector <2 x double> %lv2f64, <2 x double> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %lv4f64 = load <4 x double>, ptr undef, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sv4f64 = shufflevector <4 x double> %lv4f64, <4 x double> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %lv2i8 = load <2 x i8>, ptr undef %sv2i8 = shufflevector <2 x i8> %lv2i8, <2 x i8> undef, <2 x i32> zeroinitializer @@ -119,6 +167,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x half> %tmp1, <4 x half> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x half> %lane ; +; CODESIZE-LABEL: 'ld1r_4h_float_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load half, ptr %x, align 2 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <4 x half> undef, half %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x half> %tmp1, <4 x half> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x half> %lane +; entry: %tmp = load half, ptr %x, align 2 %tmp1 = insertelement <4 x half> undef, half %tmp, i32 0 @@ -133,6 +187,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %lane = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x half> %lane ; +; CODESIZE-LABEL: 'ld1r_8h_float_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load half, ptr %x, align 2 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <8 x half> undef, half %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %lane = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x half> %lane +; entry: %tmp = load half, ptr %x, align 2 %tmp1 = insertelement <8 x half> undef, half %tmp, i32 0 @@ -147,6 +207,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %lane ; +; CODESIZE-LABEL: 'ld1r_2s_float_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load float, ptr %x, align 4 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %lane +; entry: %tmp = load float, ptr %x, align 4 %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0 @@ -161,6 +227,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %lane ; +; CODESIZE-LABEL: 'ld1r_4s_float_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load float, ptr %x, align 4 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %lane +; entry: %tmp = load float, ptr %x, align 4 %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0 @@ -175,6 +247,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x double> %tmp1, <2 x double> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %lane ; +; CODESIZE-LABEL: 'ld1r_2d_double_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load double, ptr %x, align 4 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x double> %tmp1, <2 x double> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %lane +; entry: %tmp = load double, ptr %x, align 4 %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0 @@ -191,6 +269,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %lane ; +; CODESIZE-LABEL: 'ld1r_8b_int_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %lane +; entry: %tmp = load i8, ptr %x, align 2 %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0 @@ -205,6 +289,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %lane ; +; CODESIZE-LABEL: 'ld1r_16b_int_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %lane +; entry: %tmp = load i8, ptr %x, align 2 %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0 @@ -219,6 +309,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %lane ; +; CODESIZE-LABEL: 'ld1r_4h_int_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %lane +; entry: %tmp = load i16, ptr %x, align 2 %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0 @@ -233,6 +329,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %lane ; +; CODESIZE-LABEL: 'ld1r_8h_int_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %lane +; entry: %tmp = load i16, ptr %x, align 2 %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0 @@ -247,6 +349,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %lane ; +; CODESIZE-LABEL: 'ld1r_2s_int_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %lane +; entry: %tmp = load i32, ptr %x, align 4 %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0 @@ -261,6 +369,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lane ; +; CODESIZE-LABEL: 'ld1r_4s_int_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %lane +; entry: %tmp = load i32, ptr %x, align 4 %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 @@ -275,6 +389,12 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %lane ; +; CODESIZE-LABEL: 'ld1r_2d_int_shuff' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i64, ptr %x, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %lane +; entry: %tmp = load i64, ptr %x, align 8 %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0 Index: llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll @@ -7,21 +7,20 @@ ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -47,21 +46,20 @@ ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub reassoc <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd reassoc <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -88,21 +86,20 @@ ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fadd_fsub( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -129,21 +126,20 @@ ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fmul_fadd_fsub( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -170,21 +166,20 @@ ; CHECK-LABEL: @slp_profitable_missing_fmf_nnans_only( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul nnan float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul nnan <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub nnan <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd nnan <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul nnan float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub nnan float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul nnan float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul nnan float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd nnan float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -267,16 +262,16 @@ ; CHECK-NEXT: [[SUB_I1096:%.*]] = fsub fast float 1.000000e+00, [[TMP0:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0 -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[SHUFFLE1]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x float> [[SHUFFLE1]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x float> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x float> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: