This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Remove CreateShuffle lambda and reuse ShuffleBuilder functions.
ClosedPublic

Authored by ABataev on Mar 13 2023, 2:44 PM.

Details

Summary

After merging the main part of the gather/buildvector code, the CreateShuffle
lambda can be removed and the ShuffleBuilder add functions can be used instead.
Also, part of the code from CreateShuffle was migrated to the
BaseShuffleAnalysis::createShuffle function for better code emission.

Diff Detail

Event Timeline

ABataev created this revision. · Mar 13 2023, 2:44 PM
Herald added a project: Restricted Project. · View Herald Transcript · Mar 13 2023, 2:44 PM
ABataev requested review of this revision. · Mar 13 2023, 2:44 PM
Herald added a project: Restricted Project. · View Herald Transcript · Mar 13 2023, 2:44 PM
This revision is now accepted and ready to land. · Mar 13 2023, 6:18 PM

Hi Alexey,

This change causes incorrectly generated code or a crash in some cases. See an example below.

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define i32 @main(<16 x i32> %bc47.i, <16 x i32> %bc) {
entry:
  %0 = extractelement <16 x i32> %bc47.i, i64 0
  %1 = extractelement <16 x i32> %bc, i64 0
  %2 = extractelement <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i64 0
  %3 = mul i32 %1, %2
  %4 = mul i32 0, 0
  %5 = sub i32 %3, %4
  %6 = mul i32 0, 0
  %7 = mul i32 0, %1
  %8 = add i32 %6, %7
  store i32 %5, ptr null, align 16
  store i32 %8, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 1), align 2
  %9 = extractelement <16 x i32> zeroinitializer, i64 0
  %10 = mul i32 %9, 0
  %11 = extractelement <16 x i32> zeroinitializer, i64 0
  %12 = mul i32 0, 0
  %13 = sub i32 %10, %12
  %14 = mul i32 0, %0
  %15 = mul i32 0, 0
  %16 = add i32 %14, %15
  store i32 %13, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 2), align 4
  store i32 %16, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 3), align 2
  %17 = extractelement <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i64 0
  %18 = mul i32 %17, 0
  %19 = mul i32 0, 0
  %20 = sub i32 %18, %19
  %21 = extractelement <16 x i32> zeroinitializer, i64 1
  %22 = mul i32 %21, %0
  %23 = mul i32 0, %17
  %24 = add i32 %22, %23
  store i32 %20, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 4), align 8
  store i32 %24, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 5), align 2
  %25 = mul i32 0, %11
  %26 = extractelement <16 x i32> zeroinitializer, i64 0
  %27 = mul i32 0, 0
  %28 = sub i32 %25, %27
  %29 = mul i32 0, 0
  %30 = mul i32 %26, 0
  %31 = add i32 %29, %30
  store i32 %28, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 6), align 4
  store i32 %31, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 7), align 2
  ret i32 0
}
$ opt test.ll -passes=slp-vectorizer -mattr=+avx512f -S
opt: .../llvm/include/llvm/ADT/SmallVector.h:298: const T& llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::operator[](llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::size_type) const [with T = int; <template-parameter-1-2> = void; llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::const_reference = const int&; llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::size_type = long unsigned int]: Assertion `idx < size()' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: opt test.ll -passes=slp-vectorizer -mattr=+avx512f -S
...

If this change (and the following ones) is reverted, the code is correctly vectorized.

$ opt test.ll -passes=slp-vectorizer -mattr=+avx512f -S
...
define i32 @main(<16 x i32> %bc47.i, <16 x i32> %bc) #0 {
entry:
  %0 = extractelement <16 x i32> %bc47.i, i64 0
  %1 = extractelement <16 x i32> %bc, i64 0
  %2 = extractelement <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i64 0
  %3 = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0>, i32 %2, i32 5
  %4 = mul <8 x i32> zeroinitializer, %3
  %5 = insertelement <8 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0>, i32 %1, i32 0
  %6 = insertelement <8 x i32> %5, i32 %0, i32 5
  %7 = shufflevector <8 x i32> %6, <8 x i32> %3, <8 x i32> <i32 undef, i32 0, i32 undef, i32 5, i32 13, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <8 x i32> <i32 1, i32 undef, i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0>, <8 x i32> %7, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 12, i32 5, i32 6, i32 7>
  %9 = mul <8 x i32> %6, %8
  %10 = sub <8 x i32> %9, %4
  %11 = add <8 x i32> %9, %4
  %12 = shufflevector <8 x i32> %10, <8 x i32> %11, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  store <8 x i32> %12, ptr null, align 16
  ret i32 0
}

Hi Alexey,

This change causes incorrectly generated code or a crash in some cases. See an example below.

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define i32 @main(<16 x i32> %bc47.i, <16 x i32> %bc) {
entry:
  %0 = extractelement <16 x i32> %bc47.i, i64 0
  %1 = extractelement <16 x i32> %bc, i64 0
  %2 = extractelement <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i64 0
  %3 = mul i32 %1, %2
  %4 = mul i32 0, 0
  %5 = sub i32 %3, %4
  %6 = mul i32 0, 0
  %7 = mul i32 0, %1
  %8 = add i32 %6, %7
  store i32 %5, ptr null, align 16
  store i32 %8, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 1), align 2
  %9 = extractelement <16 x i32> zeroinitializer, i64 0
  %10 = mul i32 %9, 0
  %11 = extractelement <16 x i32> zeroinitializer, i64 0
  %12 = mul i32 0, 0
  %13 = sub i32 %10, %12
  %14 = mul i32 0, %0
  %15 = mul i32 0, 0
  %16 = add i32 %14, %15
  store i32 %13, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 2), align 4
  store i32 %16, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 3), align 2
  %17 = extractelement <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i64 0
  %18 = mul i32 %17, 0
  %19 = mul i32 0, 0
  %20 = sub i32 %18, %19
  %21 = extractelement <16 x i32> zeroinitializer, i64 1
  %22 = mul i32 %21, %0
  %23 = mul i32 0, %17
  %24 = add i32 %22, %23
  store i32 %20, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 4), align 8
  store i32 %24, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 5), align 2
  %25 = mul i32 0, %11
  %26 = extractelement <16 x i32> zeroinitializer, i64 0
  %27 = mul i32 0, 0
  %28 = sub i32 %25, %27
  %29 = mul i32 0, 0
  %30 = mul i32 %26, 0
  %31 = add i32 %29, %30
  store i32 %28, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 6), align 4
  store i32 %31, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 7), align 2
  ret i32 0
}
$ opt test.ll -passes=slp-vectorizer -mattr=+avx512f -S
opt: .../llvm/include/llvm/ADT/SmallVector.h:298: const T& llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::operator[](llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::size_type) const [with T = int; <template-parameter-1-2> = void; llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::const_reference = const int&; llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::size_type = long unsigned int]: Assertion `idx < size()' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: opt test.ll -passes=slp-vectorizer -mattr=+avx512f -S
...

If this change (and the following ones) is reverted, the code is correctly vectorized.

$ opt test.ll -passes=slp-vectorizer -mattr=+avx512f -S
...
define i32 @main(<16 x i32> %bc47.i, <16 x i32> %bc) #0 {
entry:
  %0 = extractelement <16 x i32> %bc47.i, i64 0
  %1 = extractelement <16 x i32> %bc, i64 0
  %2 = extractelement <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i64 0
  %3 = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0>, i32 %2, i32 5
  %4 = mul <8 x i32> zeroinitializer, %3
  %5 = insertelement <8 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0>, i32 %1, i32 0
  %6 = insertelement <8 x i32> %5, i32 %0, i32 5
  %7 = shufflevector <8 x i32> %6, <8 x i32> %3, <8 x i32> <i32 undef, i32 0, i32 undef, i32 5, i32 13, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <8 x i32> <i32 1, i32 undef, i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0>, <8 x i32> %7, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 12, i32 5, i32 6, i32 7>
  %9 = mul <8 x i32> %6, %8
  %10 = sub <8 x i32> %9, %4
  %11 = add <8 x i32> %9, %4
  %12 = shufflevector <8 x i32> %10, <8 x i32> %11, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  store <8 x i32> %12, ptr null, align 16
  ret i32 0
}

Thanks for the report, will check it tomorrow.

Hi Alexey,

This change causes incorrectly generated code or a crash in some cases. See an example below.

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define i32 @main(<16 x i32> %bc47.i, <16 x i32> %bc) {
entry:
  %0 = extractelement <16 x i32> %bc47.i, i64 0
  %1 = extractelement <16 x i32> %bc, i64 0
  %2 = extractelement <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i64 0
  %3 = mul i32 %1, %2
  %4 = mul i32 0, 0
  %5 = sub i32 %3, %4
  %6 = mul i32 0, 0
  %7 = mul i32 0, %1
  %8 = add i32 %6, %7
  store i32 %5, ptr null, align 16
  store i32 %8, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 1), align 2
  %9 = extractelement <16 x i32> zeroinitializer, i64 0
  %10 = mul i32 %9, 0
  %11 = extractelement <16 x i32> zeroinitializer, i64 0
  %12 = mul i32 0, 0
  %13 = sub i32 %10, %12
  %14 = mul i32 0, %0
  %15 = mul i32 0, 0
  %16 = add i32 %14, %15
  store i32 %13, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 2), align 4
  store i32 %16, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 3), align 2
  %17 = extractelement <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i64 0
  %18 = mul i32 %17, 0
  %19 = mul i32 0, 0
  %20 = sub i32 %18, %19
  %21 = extractelement <16 x i32> zeroinitializer, i64 1
  %22 = mul i32 %21, %0
  %23 = mul i32 0, %17
  %24 = add i32 %22, %23
  store i32 %20, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 4), align 8
  store i32 %24, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 5), align 2
  %25 = mul i32 0, %11
  %26 = extractelement <16 x i32> zeroinitializer, i64 0
  %27 = mul i32 0, 0
  %28 = sub i32 %25, %27
  %29 = mul i32 0, 0
  %30 = mul i32 %26, 0
  %31 = add i32 %29, %30
  store i32 %28, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 6), align 4
  store i32 %31, ptr getelementptr inbounds ([9 x i32], ptr null, i64 0, i64 7), align 2
  ret i32 0
}
$ opt test.ll -passes=slp-vectorizer -mattr=+avx512f -S
opt: .../llvm/include/llvm/ADT/SmallVector.h:298: const T& llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::operator[](llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::size_type) const [with T = int; <template-parameter-1-2> = void; llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::const_reference = const int&; llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::size_type = long unsigned int]: Assertion `idx < size()' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: opt test.ll -passes=slp-vectorizer -mattr=+avx512f -S
...

If this change (and the following ones) is reverted, the code is correctly vectorized.

$ opt test.ll -passes=slp-vectorizer -mattr=+avx512f -S
...
define i32 @main(<16 x i32> %bc47.i, <16 x i32> %bc) #0 {
entry:
  %0 = extractelement <16 x i32> %bc47.i, i64 0
  %1 = extractelement <16 x i32> %bc, i64 0
  %2 = extractelement <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i64 0
  %3 = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0>, i32 %2, i32 5
  %4 = mul <8 x i32> zeroinitializer, %3
  %5 = insertelement <8 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0>, i32 %1, i32 0
  %6 = insertelement <8 x i32> %5, i32 %0, i32 5
  %7 = shufflevector <8 x i32> %6, <8 x i32> %3, <8 x i32> <i32 undef, i32 0, i32 undef, i32 5, i32 13, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <8 x i32> <i32 1, i32 undef, i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0>, <8 x i32> %7, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 12, i32 5, i32 6, i32 7>
  %9 = mul <8 x i32> %6, %8
  %10 = sub <8 x i32> %9, %4
  %11 = add <8 x i32> %9, %4
  %12 = shufflevector <8 x i32> %10, <8 x i32> %11, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  store <8 x i32> %12, ptr null, align 16
  ret i32 0
}

This should now be fixed in commit 9255124a0713f1fe57e553c4266380a7087a61c6.