This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
clang/test/
-
test/
-
CodeGen/
-
X86/
3/5
avx-builtins.c
-
avx512f-builtins.c
-
builtins-ppc-p10vector.c
-
builtins-ppc-p9vector.c
1
builtinshufflevector2.c
-
ext-vector.c
-
CodeGenOpenCL/
-
as_type.cl
-
partial_initializer.cl
1
preserve_vec3.cl
-
vector_literals.cl
-
vector_shufflevector.cl
-
llvm/
-
include/
-
llvm-c/
-
Core.h
-
llvm/IR/
-
IR/
-
Instructions.h
-
lib/
-
Analysis/
-
InstructionSimplify.cpp
-
ValueTracking.cpp
-
IR/
-
AsmWriter.cpp
-
ConstantFold.cpp
-
Core.cpp
-
Instructions.cpp
-
Transforms/
-
InstCombine/
-
InstCombineCalls.cpp
-
InstCombineSelect.cpp
-
InstCombineSimplifyDemanded.cpp
-
InstCombineVectorOps.cpp
-
InstructionCombining.cpp
-
Scalar/
-
ScalarizeMaskedMemIntrin.cpp
-
Vectorize/
-
SLPVectorizer.cpp
-
VectorCombine.cpp
-
test/
-
Analysis/CostModel/
-
CostModel/
-
AMDGPU/
-
shufflevector.ll
-
X86/
-
reduction.ll
-
shuffle-extract_subvector.ll
-
shuffle-insert_subvector.ll
-
shuffle-single-src.ll
-
CodeGen/
-
AMDGPU/
-
rewrite-out-arguments.ll
-
Generic/
-
expand-experimental-reductions.ll
-
PowerPC/
-
arg_promotion.ll
-
Transforms/
-
CodeGenPrepare/X86/
-
X86/
-
x86-shuffle-sink-inseltpoison.ll
-
x86-shuffle-sink.ll
-
DeadStoreElimination/
-
masked-dead-store-inseltpoison.ll
-
masked-dead-store.ll
-
InstCombine/
-
AMDGPU/
-
amdgcn-demanded-vector-elts-inseltpoison.ll
-
amdgcn-demanded-vector-elts.ll
-
X86/
-
x86-addsub-inseltpoison.ll
-
x86-addsub.ll
-
x86-avx2-inseltpoison.ll
-
x86-avx2.ll
-
x86-avx512-inseltpoison.ll
-
x86-avx512.ll
-
x86-muldq-inseltpoison.ll
-
x86-muldq.ll
-
x86-pack-inseltpoison.ll
-
x86-pack.ll
-
x86-pshufb-inseltpoison.ll
-
x86-pshufb.ll
-
x86-sse4a-inseltpoison.ll
-
x86-sse4a.ll
-
x86-vpermil-inseltpoison.ll
-
x86-vpermil.ll
-
broadcast-inseltpoison.ll
-
broadcast.ll
-
bswap-inseltpoison.ll
-
bswap.ll
-
canonicalize-vector-insert.ll
-
extractelement-inseltpoison.ll
-
extractelement.ll
-
insert-const-shuf.ll
-
insert-extract-shuffle-inseltpoison.ll
-
insert-extract-shuffle.ll
-
masked_intrinsics-inseltpoison.ll
-
masked_intrinsics.ll
-
nsw-inseltpoison.ll
-
nsw.ll
-
reduction-shufflevector.ll
-
select-extractelement-inseltpoison.ll
-
select-extractelement.ll
-
select-select.ll
-
shuffle-select-narrow-inseltpoison.ll
-
shuffle-select-narrow.ll
-
shuffle_select-inseltpoison.ll
-
shuffle_select.ll
-
shufflevec-bitcast-inseltpoison.ll
-
shufflevec-bitcast.ll
-
shufflevector-div-rem-inseltpoison.ll
-
shufflevector-div-rem.ll
-
sub-of-negatible-inseltpoison.ll
-
sub-of-negatible.ll
-
trunc-inseltpoison.ll
-
trunc.ll
-
type_pun-inseltpoison.ll
-
type_pun.ll
-
vec-binop-select-inseltpoison.ll
-
vec-binop-select.ll
-
vec_demanded_elts-inseltpoison.ll
-
vec_demanded_elts.ll
-
vec_shuffle-inseltpoison.ll
-
vec_shuffle.ll
-
vector-casts.ll
-
vector-concat-binop-inseltpoison.ll
-
vector-concat-binop.ll
-
InstSimplify/
-
shufflevector-inseltpoison.ll
-
shufflevector.ll
-
InterleavedAccess/
-
AArch64/
-
interleaved-accesses-inseltpoison.ll
-
interleaved-accesses.ll
-
ARM/
-
interleaved-accesses-inseltpoison.ll
-
interleaved-accesses.ll
-
X86/
-
interleaved-accesses-64bits-avx-inseltpoison.ll
-
interleaved-accesses-64bits-avx.ll
-
interleavedStore-inseltpoison.ll
-
interleavedStore.ll
-
LoopUnroll/X86/
-
X86/
-
pr46430-inseltpoison.ll
-
pr46430.ll
-
LoopVectorize/
-
AArch64/
-
interleaved-store-of-first-order-recurrence.ll
-
X86/
-
x86-interleaved-store-accesses-with-gaps.ll
-
interleaved-accesses.ll
-
LowerMatrixIntrinsics/
-
bigger-expressions-double.ll
-
const-gep.ll
-
load-align-volatile.ll
-
multiply-add-sub-double-row-major.ll
-
multiply-double-contraction-fmf.ll
-
multiply-double-contraction.ll
-
multiply-double-row-major.ll
-
multiply-double.ll
-
multiply-float-contraction-fmf.ll
-
multiply-float-contraction.ll
-
multiply-float.ll
-
multiply-i32-row-major.ll
-
multiply-i32.ll
-
multiply-left-transpose-row-major.ll
-
multiply-right-transpose.ll
-
preserve-existing-fast-math-flags.ll
-
propagate-backwards-unsupported.ll
-
strided-load-double.ll
-
strided-load-float.ll
-
strided-load-i32.ll
-
transpose-double.ll
-
transpose-float.ll
-
transpose-i32.ll
-
transpose-opts.ll
-
PhaseOrdering/
-
ARM/
-
mve-floatreduce.ll
-
X86/
-
vector-reductions-expanded.ll
-
vector-reductions-logical.ll
-
vector-reductions.ll
-
SLPVectorizer/
-
AArch64/
-
accelerate-vector-functions-inseltpoison.ll
-
accelerate-vector-functions.ll
-
transpose-inseltpoison.ll
-
transpose.ll
-
vectorize-free-extracts-inserts.ll
-
AMDGPU/
-
add_sub_sat-inseltpoison.ll
-
add_sub_sat.ll
-
SystemZ/
-
pr34619.ll
-
X86/
-
PR35865-inseltpoison.ll
-
PR35865.ll
-
alternate-calls-inseltpoison.ll
-
alternate-calls.ll
-
alternate-cast-inseltpoison.ll
-
alternate-cast.ll
-
alternate-fp-inseltpoison.ll
-
alternate-fp.ll
-
alternate-int-inseltpoison.ll
-
alternate-int.ll
-
arith-fp-inseltpoison.ll
-
arith-fp.ll
-
blending-shuffle-inseltpoison.ll
-
blending-shuffle.ll
-
cmp_commute-inseltpoison.ll
-
cmp_commute.ll
-
extract_with_non_const_index.ll
-
insert-element-build-vector-inseltpoison.ll
-
insert-element-build-vector.ll
-
insert-shuffle.ll
-
load-merge-inseltpoison.ll
-
load-merge.ll
-
matched-shuffled-entries.ll
-
phi.ll
-
pr47629-inseltpoison.ll
-
pr47629.ll
-
pr47642.ll
-
pr49081.ll
-
remark_extract_broadcast.ll
-
resched.ll
-
sitofp-inseltpoison.ll
-
sitofp.ll
-
vec_list_bias-inseltpoison.ll
-
vec_list_bias.ll
-
SROA/
-
slice-width.ll
-
vector-promotion.ll
-
VectorCombine/
-
AArch64/
-
load-extractelement-scalarization.ll
-
AMDGPU/
-
as-transition-inseltpoison.ll
-
as-transition.ll
-
X86/
-
extract-binop-inseltpoison.ll
-
extract-binop.ll
-
extract-cmp-binop.ll
-
extract-cmp.ll
-
load-inseltpoison.ll
-
load.ll
-
tools/llvm-c-test/
-
llvm-c-test/
-
echo.cpp

Differential D103874

[IR] Rename the shufflevector's undef mask to poison
Needs ReviewPublic

Authored by aqjune on Jun 8 2021, 12:36 AM.

Download Raw Diff

Details

Reviewers

nikic
efriedma
spatel
fhahn
lebedev.ri
RKSimon
deadalnix

Summary

This is a patch that renames shufflevector's undef mask to poison.

By D93818, shufflevector's undef mask isn't undef anymore; it returns poison instead.

%v = shufflevector <2 x i8> %x, <2 x i8> %y, <2 x i8> <i8 0, i8 poison>
; %v[0] = %x[0]
; %v[1] = poison

Since poison is more undefined than undef, this validates many existing transformations that we wanted to support.
Also, this allows more aggressive optimizations because poison is more propagative (e.g. poison & 0 = poison whereas undef & 0 != undef).

This patch updates shufflevector mask's printed string to be poison to match its new semantics.

This has changes in clang tests as well.
They are mainly about vector intrinsics being lowered into shufflevector.
The unused elements were filled with undef previously, but with this patch they are filled with poison.
Since they are unused elements anyway, I believe this is not actually a functional change.
However, I would be happy to have this double-checked by someone who works on these intrinsics.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

aqjune created this revision.Jun 8 2021, 12:36 AM

Herald added a reviewer: deadalnix. · View Herald TranscriptJun 8 2021, 12:36 AM

Herald added subscribers: dexonsmith, kerbowa, pengfei and 7 others. · View Herald Transcript

aqjune requested review of this revision.Jun 8 2021, 12:36 AM

Herald added projects: Restricted Project, Restricted Project. · View Herald TranscriptJun 8 2021, 12:36 AM

Herald added subscribers: llvm-commits, cfe-commits. · View Herald Transcript

Harbormaster completed remote builds in B108148: Diff 350513.Jun 8 2021, 1:27 AM

I noted the cases where it looks like the undef->poison change might actually impact code using compiler intrinsic functions that have external specifications. The relevant specifications say the elements in question are "undefined", without really specifying what that means.

Currently, for the Intel intrinsics, we treat "undefined" as something more conservative than LLVM undef; see https://github.com/llvm/llvm-project/blob/d2012d965d60c3258b3a69d024491698f8aec386/clang/lib/CodeGen/CGBuiltin.cpp#L12483 . Maybe we should make the cast intrinsics more conservative to match. And maybe we should do the same for OpenCL. Would need to do some backend work to make sure we don't regress the generated code, though.

For __builtin_shufflevector, I think I'm fine with this changing the "-1" to mean poison; we don't have any external spec to conform to, and anyone explicitly passing -1 should know what they're doing. But maybe worth noting in the clang documentation.

clang/test/CodeGen/X86/avx-builtins.c
182	This change might be visible to user code.
clang/test/CodeGen/builtinshufflevector2.c
41	This might be visible to user code.
clang/test/CodeGenOpenCL/preserve_vec3.cl
27	This change might be visible to user code.

In D103874#2806519, @efriedma wrote:

I noted the cases where it looks like the undef->poison change might actually impact code using compiler intrinsic functions that have external specifications. The relevant specifications say the elements in question are "undefined", without really specifying what that means.

Currently, for the Intel intrinsics, we treat "undefined" as something more conservative than LLVM undef; see https://github.com/llvm/llvm-project/blob/d2012d965d60c3258b3a69d024491698f8aec386/clang/lib/CodeGen/CGBuiltin.cpp#L12483 . Maybe we should make the cast intrinsics more conservative to match. And maybe we should do the same for OpenCL. Would need to do some backend work to make sure we don't regress the generated code, though.

Makes sense; the PR (https://llvm.org/PR32176) referenced in that comment also says it should be something like freeze poison. (BTW, this means the current shufflevector lowering is already incorrect as well.)

Then, _mm256_castsi128_si256 should be lowered into something like this:

%fr = freeze <2 x i64> poison
shufflevector <2 x i64> %x, <2 x i64> %fr, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

BTW, the Intel intrinsic guide for _mm256_castsi128_si256 ( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi128_si256&expand=628 ) says:

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

We should teach the backend to understand the shufflevector+freeze form and lower it into efficient assembly.

I *guess* teaching this form to the backend will be enough in terms of performance.
In practice, the frozen element won't be used in most of the cases; the middle-end's demanded elements analysis will trigger instcombine to almost always remove the freeze.

What do you think? If people agree with the shufflevector+freeze lowering, I'll create a separate patch that lowers this to the new freeze+shufflevector format (since it is already incorrect).

In practice, the frozen element won't be used in most of the cases; the middle-end's demanded elements analysis will trigger instcombine to almost always remove the freeze.

Well, in the cases it gets removed, it doesn't really matter what we use. It's likely if someone is reaching for these more obscure constructs, they're seeing that the compiler isn't doing what they want with more normal code, though.

What do you think? If people agree with the shufflevector+freeze lowering, I'll create a separate patch that lowers this to the new freeze+shufflevector format (since it is already incorrect).

Using "freeze poison" seems reasonable.

aqjune mentioned this in D104790: [x86] fix mm*_undefined* intrinsics to use arbitrary frozen bit pattern.Jun 23 2021, 9:03 AM

aqjune mentioned this in D93818: [LangRef] Update shufflevector's semantics to return poison if the mask is undef.Sep 20 2021, 1:57 AM

RKSimon added inline comments.Sep 20 2021, 3:36 AM

clang/test/CodeGen/X86/avx-builtins.c
182	Yes the length changing casts are worrying me as well - we could update the header to insert zero into the upper elements I suppose, in many cases these would be folded away by AVX ops implicitly zeroing the 128-bits. But we'd definitely have the potential for regressions.
1237	These look out of date - D109497 changes the loadu2 codegen to be a single 'concat' shuffle.

Rebase

Herald added a subscriber: arphaman. · View Herald TranscriptSep 26 2021, 6:20 AM

Resurrect mistakenly removed test statements

Harbormaster completed remote builds in B125751: Diff 375095.Sep 26 2021, 6:52 AM

aqjune marked an inline comment as done.Sep 26 2021, 7:04 AM

aqjune added inline comments.

clang/test/CodeGen/X86/avx-builtins.c
182	I quickly skimmed through the headers in clang/lib/Headers and listed the functions calling `__builtin_shufflevector` with at least one -1 mask operand. It seems there aren't very many, which is good news; I found only 17 functions ( list.txt, 563 B, Download ). But correctly fixing these headers seems to require a lot of work. Since using the zero vector can cause performance regressions, we need to use a frozen poison (undef) vector to encode a vector having unspecified bits. A few months ago, I created D104790 to start using freeze(vector poison) for `mm_undefined` intrinsics. However, teaching the existing codebase to successfully deal with the frozen poison vector was a pretty tough job. When it comes to fixing the headers, there is not even a C intrinsic function that represents a frozen poison vector, AFAIK. I'd appreciate any ideas or help in addressing this issue. :/

nikic mentioned this in D115526: [InstCombine] don't automatically drop poison-generating flags in SimplifyVectorDemandedElts.Dec 10 2021, 8:16 AM

spatel mentioned this in D115460: Add FMF to hasPoisonGeneratingFlags/dropPoisonGeneratingFlags.Dec 13 2021, 11:40 AM

RKSimon mentioned this in rG1b07bd9034bd: [X86] Add tests for vector widening with freeze(undef).May 13 2022, 4:12 AM

It seems llvm/lib/Target/X86/X86ISelLowering.cpp's LowerAVXCONCAT_VECTORS is relevant to efficient lowering of shufflevector %x, freeze(poison), mask.

Herald added a project: Restricted Project. · View Herald TranscriptJun 19 2022, 4:55 AM

Herald added subscribers: jsji, kosarev. · View Herald Transcript

dexonsmith removed a subscriber: dexonsmith.Jun 19 2022, 8:12 AM

In D103874#3594617, @aqjune wrote:

It seems llvm/lib/Target/X86/X86ISelLowering.cpp's LowerAVXCONCAT_VECTORS is relevant to efficient lowering of shufflevector %x, freeze(poison), mask.

After patching LowerAVXCONCAT_VECTORS, lowering https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll#L257 generates:

vblendps  $15, %ymm0, %ymm0, %ymm0

To make it fully no-op, tablegen files must be edited. I gave it a try, .td compiled successfully, but weirdly - perhaps due to either incorrect use of tablegen's pattern matcher or some hidden rule that I didn't address - vblendps is still there. The written patch is as follows:
https://github.com/aqjune/llvm-project/commit/b4393e36b33ca08ce77ae662479ceaf9a76eab8b

One of relevant, edited parts:

// llvm/lib/Target/X86/X86InstrVecCompiler.td
  def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
            (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;
+
+  def : Pat<(VT (insert_subvector (freeze undef), subRC:$src, (iPTR 0))),
+            (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;

I spent some time but couldn't figure out why it does not work.
Can someone tell me whether the pattern matching is being correctly used? Any help is appreciated.

Which intrinsic are you working on here? If this is about the mm_undefined intrinsics, why do we need to change those from the current status quo of using a zero value instead of undef?

In D103874#3611483, @nikic wrote:

Which intrinsic are you working on here? If this is about the mm_undefined intrinsics, why do we need to change those from the current status quo of using a zero value instead of undef?

It is about the mm256_castpd128_pd256 intrinsic and its friends (clang/test/CodeGen/X86/avx-builtins.c, line 146).
It was previously using shufflevector with undef masks - since the results are poison, an alternative pattern as below is necessary to represent the intrinsic:

%a1 = freeze <2 x double> poison
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

In D103874#3611507, @aqjune wrote:
In D103874#3611483, @nikic wrote:

Which intrinsic are you working on here? If this is about the mm_undefined intrinsics, why do we need to change those from the current status quo of using a zero value instead of undef?

It is about the mm256_castpd128_pd256 intrinsic and its friends (clang/test/CodeGen/X86/avx-builtins.c, line 146).
It was previously using shufflevector with undef masks - since the results are poison, an alternative pattern as below is necessary to represent the intrinsic:
%a1 = freeze <2 x double> poison
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

How sure are we that we cannot simply use poison elements here? I checked what the Intel compiler guide has to say on the topic, and it uses the following wording. For "undefined" style intrinsics:

This intrinsic returns a vector of eight single precision floating point elements. The content of the vector is not specified.

For "cast" style intrinsics:

The lower 128-bits of the 256-bit resulting vector contains the source vector values; the upper 128-bits of the resulting vector are undefined. This intrinsic does not introduce extra moves to the generated code

It's not really clear what "undefined" is supposed to mean here (and how it differs from "not specified").

Unless we're aware of specific problems in this area, I think it's okay to start out with just doing the undef -> poison replacement, and possibly backtrack if there are real-world assumptions about the specific meaning of "undefined" in this context.

PR31524 (https://github.com/llvm/llvm-project/issues/31524) discusses the lowering of such intrinsics.
According to the PR, it seems users consider undefined intrinsics as returning unknown but consistent bits.

If it works, my updates in the draft (https://github.com/aqjune/llvm-project/commit/b4393e36b33ca08ce77ae662479ceaf9a76eab8b) will improve backends dealing with freeze(poison), which will help lowering _mm256_cast* with zero copy.

aqjune mentioned this in D130339: [CodeGen] Generate efficient assembly for freeze(poison) version of `mm*_cast*` intel intrinsics.Jul 22 2022, 2:48 AM

aqjune mentioned this in rG02e56e253302: [CodeGen] Generate efficient assembly for freeze(poison) version of `mm*_cast*`….Aug 10 2022, 9:38 PM

aqjune added inline comments.Aug 10 2022, 9:48 PM

clang/test/CodeGen/X86/avx-builtins.c
182	Okay, D130339 has finally been merged. I will make a patch that updates `mm256_castsi128_si256` and its family of functions to emit shufflevector with a freeze(poison) operand.

ManuelJBrito mentioned this in D143287: [Clang][X86] Change X86 cast intrinsics to use __builtin_nondeterministic_value.Feb 3 2023, 10:25 AM

Large Diff

This large diff affects 195 files. Files without inline comments have been collapsed. Expand All Files

Revision Contents

Path

Size

clang/

test/

CodeGen/

X86/

avx-builtins.c

6 lines

avx512f-builtins.c

10 lines

builtins-ppc-p10vector.c

8 lines

builtins-ppc-p9vector.c

8 lines

builtinshufflevector2.c

2 lines

ext-vector.c

4 lines

CodeGenOpenCL/

as_type.cl

6 lines

partial_initializer.cl

4 lines

preserve_vec3.cl

2 lines

vector_literals.cl

22 lines

vector_shufflevector.cl

2 lines

llvm/

include/

llvm-c/

Core.h

6 lines

llvm/

IR/

Instructions.h

14 lines

lib/

Analysis/

InstructionSimplify.cpp

2 lines

ValueTracking.cpp

2 lines

IR/

8 lines

5 lines

2 lines

14 lines

Transforms/

InstCombine/

InstCombineCalls.cpp

4 lines

InstCombineSelect.cpp

4 lines

InstCombineSimplifyDemanded.cpp

2 lines

InstCombineVectorOps.cpp

18 lines

InstructionCombining.cpp

4 lines

Scalar/

ScalarizeMaskedMemIntrin.cpp

2 lines

Vectorize/

SLPVectorizer.cpp

38 lines

VectorCombine.cpp

10 lines

test/

Analysis/

CostModel/

AMDGPU/

shufflevector.ll

28 lines

X86/

reduction.ll

670 lines

shuffle-extract_subvector.ll

6 lines

shuffle-insert_subvector.ll

550 lines

shuffle-single-src.ll

6 lines

CodeGen/

AMDGPU/

rewrite-out-arguments.ll

14 lines

Generic/

expand-experimental-reductions.ll

34 lines

PowerPC/

arg_promotion.ll

6 lines

Transforms/

CodeGenPrepare/

X86/

x86-shuffle-sink-inseltpoison.ll

8 lines

x86-shuffle-sink.ll

8 lines

DeadStoreElimination/

masked-dead-store-inseltpoison.ll

16 lines

masked-dead-store.ll

16 lines

InstCombine/

AMDGPU/

amdgcn-demanded-vector-elts-inseltpoison.ll

14 lines

amdgcn-demanded-vector-elts.ll

14 lines

X86/

x86-addsub-inseltpoison.ll

4 lines

x86-addsub.ll

4 lines

x86-avx2-inseltpoison.ll

6 lines

x86-avx2.ll

6 lines

x86-avx512-inseltpoison.ll

56 lines

x86-avx512.ll

56 lines

x86-muldq-inseltpoison.ll

4 lines

x86-muldq.ll

4 lines

x86-pack-inseltpoison.ll

12 lines

x86-pack.ll

12 lines

x86-pshufb-inseltpoison.ll

10 lines

x86-pshufb.ll

10 lines

x86-sse4a-inseltpoison.ll

34 lines

x86-sse4a.ll

34 lines

x86-vpermil-inseltpoison.ll

22 lines

x86-vpermil.ll

22 lines

broadcast-inseltpoison.ll

6 lines

broadcast.ll

6 lines

bswap-inseltpoison.ll

2 lines

bswap.ll

2 lines

canonicalize-vector-insert.ll

16 lines

extractelement-inseltpoison.ll

4 lines

extractelement.ll

4 lines

insert-const-shuf.ll

12 lines

insert-extract-shuffle-inseltpoison.ll

58 lines

insert-extract-shuffle.ll

58 lines

masked_intrinsics-inseltpoison.ll

2 lines

masked_intrinsics.ll

2 lines

nsw-inseltpoison.ll

2 lines

nsw.ll

2 lines

reduction-shufflevector.ll

8 lines

select-extractelement-inseltpoison.ll

4 lines

select-extractelement.ll

44 lines

select-select.ll

2 lines

shuffle-select-narrow-inseltpoison.ll

12 lines

shuffle-select-narrow.ll

12 lines

shuffle_select-inseltpoison.ll

46 lines

shuffle_select.ll

46 lines

shufflevec-bitcast-inseltpoison.ll

2 lines

shufflevec-bitcast.ll

2 lines

shufflevector-div-rem-inseltpoison.ll

10 lines

shufflevector-div-rem.ll

10 lines

sub-of-negatible-inseltpoison.ll

4 lines

sub-of-negatible.ll

4 lines

trunc-inseltpoison.ll

2 lines

trunc.ll

2 lines

type_pun-inseltpoison.ll

2 lines

type_pun.ll

2 lines

vec-binop-select-inseltpoison.ll

4 lines

vec-binop-select.ll

10 lines

vec_demanded_elts-inseltpoison.ll

70 lines

vec_demanded_elts.ll

70 lines

vec_shuffle-inseltpoison.ll

98 lines

vec_shuffle.ll

98 lines

vector-casts.ll

16 lines

vector-concat-binop-inseltpoison.ll

24 lines

vector-concat-binop.ll

24 lines

InstSimplify/

shufflevector-inseltpoison.ll

8 lines

shufflevector.ll

8 lines

InterleavedAccess/

AArch64/

interleaved-accesses-inseltpoison.ll

4 lines

interleaved-accesses.ll

4 lines

ARM/

interleaved-accesses-inseltpoison.ll

146 lines

interleaved-accesses.ll

146 lines

X86/

interleaved-accesses-64bits-avx-inseltpoison.ll

2 lines

interleaved-accesses-64bits-avx.ll

2 lines

interleavedStore-inseltpoison.ll

14 lines

interleavedStore.ll

14 lines

LoopUnroll/

X86/

pr46430-inseltpoison.ll

2 lines

pr46430.ll

2 lines

LoopVectorize/

AArch64/

interleaved-store-of-first-order-recurrence.ll

2 lines

X86/

x86-interleaved-store-accesses-with-gaps.ll

6 lines

interleaved-accesses.ll

4 lines

LowerMatrixIntrinsics/

bigger-expressions-double.ll

36 lines

const-gep.ll

8 lines

load-align-volatile.ll

2 lines

multiply-add-sub-double-row-major.ll

8 lines

multiply-double-contraction-fmf.ll

8 lines

multiply-double-contraction.ll

8 lines

multiply-double-row-major.ll

36 lines

multiply-double.ll

36 lines

multiply-float-contraction-fmf.ll

8 lines

multiply-float-contraction.ll

8 lines

multiply-float.ll

36 lines

multiply-i32-row-major.ll

36 lines

multiply-i32.ll

36 lines

multiply-left-transpose-row-major.ll

8 lines

multiply-right-transpose.ll

16 lines

preserve-existing-fast-math-flags.ll

24 lines

propagate-backwards-unsupported.ll

46 lines

strided-load-double.ll

2 lines

strided-load-float.ll

2 lines

2 lines

2 lines

2 lines

2 lines

80 lines

PhaseOrdering/

ARM/

mve-floatreduce.ll

2 lines

X86/

vector-reductions-expanded.ll

44 lines

vector-reductions-logical.ll

12 lines

vector-reductions.ll

10 lines

SLPVectorizer/

AArch64/

accelerate-vector-functions-inseltpoison.ll

12 lines

accelerate-vector-functions.ll

12 lines

transpose-inseltpoison.ll

10 lines

transpose.ll

10 lines

vectorize-free-extracts-inserts.ll

18 lines

AMDGPU/

add_sub_sat-inseltpoison.ll

8 lines

add_sub_sat.ll

8 lines

SystemZ/

pr34619.ll

2 lines

X86/

PR35865-inseltpoison.ll

2 lines

PR35865.ll

2 lines

alternate-calls-inseltpoison.ll

30 lines

alternate-calls.ll

30 lines

alternate-cast-inseltpoison.ll

8 lines

alternate-cast.ll

8 lines

alternate-fp-inseltpoison.ll

4 lines

alternate-fp.ll

4 lines

alternate-int-inseltpoison.ll

60 lines

alternate-int.ll

60 lines

arith-fp-inseltpoison.ll

8 lines

arith-fp.ll

8 lines

blending-shuffle-inseltpoison.ll

2 lines

blending-shuffle.ll

2 lines

cmp_commute-inseltpoison.ll

4 lines

cmp_commute.ll

4 lines

extract_with_non_const_index.ll

2 lines

insert-element-build-vector-inseltpoison.ll

4 lines

insert-element-build-vector.ll

6 lines

insert-shuffle.ll

4 lines

load-merge-inseltpoison.ll

2 lines

load-merge.ll

2 lines

matched-shuffled-entries.ll

4 lines

phi.ll

2 lines

pr47629-inseltpoison.ll

142 lines

pr47629.ll

142 lines

pr47642.ll

2 lines

pr49081.ll

4 lines

remark_extract_broadcast.ll

2 lines

resched.ll

6 lines

sitofp-inseltpoison.ll

4 lines

sitofp.ll

8 lines

vec_list_bias-inseltpoison.ll

2 lines

vec_list_bias.ll

2 lines

SROA/

slice-width.ll

2 lines

vector-promotion.ll

8 lines

VectorCombine/

AArch64/

load-extractelement-scalarization.ll

2 lines

AMDGPU/

as-transition-inseltpoison.ll

2 lines

as-transition.ll

2 lines

X86/

extract-binop-inseltpoison.ll

40 lines

40 lines

8 lines

10 lines

38 lines

38 lines

tools/

llvm-c-test/

echo.cpp

2 lines

Diff 375095

clang/test/CodeGen/X86/avx-builtins.c

Show First 20 Lines • Show All 137 Lines • ▼ Show 20 Lines
__m256i test_mm256_castpd_si256(__m256d A) {		__m256i test_mm256_castpd_si256(__m256d A) {
// CHECK-LABEL: test_mm256_castpd_si256		// CHECK-LABEL: test_mm256_castpd_si256
// CHECK: bitcast <4 x double> %{{.*}} to <4 x i64>		// CHECK: bitcast <4 x double> %{{.*}} to <4 x i64>
return _mm256_castpd_si256(A);		return _mm256_castpd_si256(A);
}		}

__m256d test_mm256_castpd128_pd256(__m128d A) {		__m256d test_mm256_castpd128_pd256(__m128d A) {
// CHECK-LABEL: test_mm256_castpd128_pd256		// CHECK-LABEL: test_mm256_castpd128_pd256
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>		// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
return _mm256_castpd128_pd256(A);		return _mm256_castpd128_pd256(A);
}		}

__m128d test_mm256_castpd256_pd128(__m256d A) {		__m128d test_mm256_castpd256_pd128(__m256d A) {
// CHECK-LABEL: test_mm256_castpd256_pd128		// CHECK-LABEL: test_mm256_castpd256_pd128
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>		// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
return _mm256_castpd256_pd128(A);		return _mm256_castpd256_pd128(A);
}		}

__m256d test_mm256_castps_pd(__m256 A) {		__m256d test_mm256_castps_pd(__m256 A) {
// CHECK-LABEL: test_mm256_castps_pd		// CHECK-LABEL: test_mm256_castps_pd
// CHECK: bitcast <8 x float> %{{.*}} to <4 x double>		// CHECK: bitcast <8 x float> %{{.*}} to <4 x double>
return _mm256_castps_pd(A);		return _mm256_castps_pd(A);
}		}

__m256i test_mm256_castps_si256(__m256 A) {		__m256i test_mm256_castps_si256(__m256 A) {
// CHECK-LABEL: test_mm256_castps_si256		// CHECK-LABEL: test_mm256_castps_si256
// CHECK: bitcast <8 x float> %{{.*}} to <4 x i64>		// CHECK: bitcast <8 x float> %{{.*}} to <4 x i64>
return _mm256_castps_si256(A);		return _mm256_castps_si256(A);
}		}

__m256 test_mm256_castps128_ps256(__m128 A) {		__m256 test_mm256_castps128_ps256(__m128 A) {
// CHECK-LABEL: test_mm256_castps128_ps256		// CHECK-LABEL: test_mm256_castps128_ps256
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>		// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
return _mm256_castps128_ps256(A);		return _mm256_castps128_ps256(A);
}		}

__m128 test_mm256_castps256_ps128(__m256 A) {		__m128 test_mm256_castps256_ps128(__m256 A) {
// CHECK-LABEL: test_mm256_castps256_ps128		// CHECK-LABEL: test_mm256_castps256_ps128
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>		// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm256_castps256_ps128(A);		return _mm256_castps256_ps128(A);
}		}

__m256i test_mm256_castsi128_si256(__m128i A) {		__m256i test_mm256_castsi128_si256(__m128i A) {
// CHECK-LABEL: test_mm256_castsi128_si256		// CHECK-LABEL: test_mm256_castsi128_si256
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>		// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
		efriedmaUnsubmitted Not Done Reply Inline Actions This change might be visible to user code. efriedma: This change might be visible to user code.
		RKSimonUnsubmitted Not Done Reply Inline Actions Yes the length changing casts are worrying me as well - we could update the header to insert zero into the upper elements I suppose, in many cases these would be folded away by AVX ops implicitly zeroing the 128-bits. But we'd definitely have the potential for regressions. RKSimon: Yes the length changing casts are worrying me as well - we could update the header to insert…
		aqjuneAuthorUnsubmitted Done Reply Inline Actions I quickly skimmed through the headers in clang/lib/Headers and listed the functions calling `__builtin_shufflevector` with at least one -1 mask operand. It seems there aren't very many, which is good news; I found 17 functions only ( list.txt563 BDownload ). But, correctly fixing these headers seems to require a lot of work. Since using the zero vector can cause performance regressions, we need to use a frozen poison (undef) vector to encode a vector having unspecified bits. A few months ago, I created D104790 to start using freeze(vector poison) for `mm_undefined` intrinsics. However, teaching the existing codebase to successfully deal with the frozen poison vector was a pretty tough job. When it comes to fixing the headers, there is even no C intrinsic function that represents a frozen poison vector AFAIK. I'll appreciate any idea or help in addressing this issue. :/ aqjune: I quickly skimmed through the headers in clang/lib/Headers and listed the functions calling…
		aqjuneAuthorUnsubmitted Done Reply Inline Actions Okay, D130339 has finally been merged. I will make a patch that updates the `mm256_castsi128_si256` and its family functions to emit shufflevector with freeze(poison) operand. aqjune: Okay, D130339 has finally been merged. I will make a patch that updates the…
return _mm256_castsi128_si256(A);		return _mm256_castsi128_si256(A);
}		}

__m256d test_mm256_castsi256_pd(__m256i A) {		__m256d test_mm256_castsi256_pd(__m256i A) {
// CHECK-LABEL: test_mm256_castsi256_pd		// CHECK-LABEL: test_mm256_castsi256_pd
// CHECK: bitcast <4 x i64> %{{.*}} to <4 x double>		// CHECK: bitcast <4 x i64> %{{.*}} to <4 x double>
return _mm256_castsi256_pd(A);		return _mm256_castsi256_pd(A);
}		}
▲ Show 20 Lines • Show All 1,038 Lines • ▼ Show 20 Lines	__m256i test_mm256_loadu_si256(__m256i* A) {
// CHECK: load <4 x i64>, <4 x i64>* %{{.+}}, align 1{{$}}		// CHECK: load <4 x i64>, <4 x i64>* %{{.+}}, align 1{{$}}
return _mm256_loadu_si256(A);		return _mm256_loadu_si256(A);
}		}

__m256 test_mm256_loadu2_m128(float* A, float* B) {		__m256 test_mm256_loadu2_m128(float* A, float* B) {
// CHECK-LABEL: test_mm256_loadu2_m128		// CHECK-LABEL: test_mm256_loadu2_m128
// CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}		// CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
// CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}		// CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>		// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
		RKSimonUnsubmitted Done Reply Inline Actions These look out of date - D109497 changes the loadu2 codegen to be a single 'concat' shuffle. RKSimon: These look out of date - D109497 changes the loadu2 codegen to be a single 'concat' shuffle.
return _mm256_loadu2_m128(A, B);		return _mm256_loadu2_m128(A, B);
}		}

__m256d test_mm256_loadu2_m128d(double* A, double* B) {		__m256d test_mm256_loadu2_m128d(double* A, double* B) {
// CHECK-LABEL: test_mm256_loadu2_m128d		// CHECK-LABEL: test_mm256_loadu2_m128d
// CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}		// CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
// CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}		// CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>		// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
▲ Show 20 Lines • Show All 919 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[IR] Rename the shufflevector's undef mask to poison (Needs Review, Public)

Details

Diff Detail

Event Timeline

Large Diff

Revision Contents

Diff 375095

clang/test/CodeGen/X86/avx-builtins.c

clang/test/CodeGen/X86/avx512f-builtins.c

clang/test/CodeGen/builtins-ppc-p10vector.c

clang/test/CodeGen/builtins-ppc-p9vector.c

clang/test/CodeGen/builtinshufflevector2.c

clang/test/CodeGen/ext-vector.c

clang/test/CodeGenOpenCL/as_type.cl

clang/test/CodeGenOpenCL/partial_initializer.cl

clang/test/CodeGenOpenCL/preserve_vec3.cl

clang/test/CodeGenOpenCL/vector_literals.cl

clang/test/CodeGenOpenCL/vector_shufflevector.cl

llvm/include/llvm-c/Core.h

llvm/include/llvm/IR/Instructions.h

llvm/lib/Analysis/InstructionSimplify.cpp

llvm/lib/Analysis/ValueTracking.cpp

llvm/lib/IR/AsmWriter.cpp

llvm/lib/IR/ConstantFold.cpp

llvm/lib/IR/Core.cpp

llvm/lib/IR/Instructions.cpp

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp

llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp

llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

llvm/test/Analysis/CostModel/X86/reduction.ll

llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll

llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll

llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll

llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll

llvm/test/CodeGen/Generic/expand-experimental-reductions.ll

llvm/test/CodeGen/PowerPC/arg_promotion.ll

llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll

llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll

llvm/test/Transforms/DeadStoreElimination/masked-dead-store-inseltpoison.ll

llvm/test/Transforms/DeadStoreElimination/masked-dead-store.ll

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll

llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll

llvm/test/Transforms/InstCombine/X86/x86-addsub.ll

llvm/test/Transforms/InstCombine/X86/x86-avx2-inseltpoison.ll

llvm/test/Transforms/InstCombine/X86/x86-avx2.ll

llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll

llvm/test/Transforms/InstCombine/X86/x86-avx512.ll

llvm/test/Transforms/InstCombine/X86/x86-muldq-inseltpoison.ll

llvm/test/Transforms/InstCombine/X86/x86-muldq.ll

llvm/test/Transforms/InstCombine/X86/x86-pack-inseltpoison.ll

llvm/test/Transforms/InstCombine/X86/x86-pack.ll

llvm/test/Transforms/InstCombine/X86/x86-pshufb-inseltpoison.ll

llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll

llvm/test/Transforms/InstCombine/X86/x86-sse4a-inseltpoison.ll

llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll

llvm/test/Transforms/InstCombine/X86/x86-vpermil-inseltpoison.ll

llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll

llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll

llvm/test/Transforms/InstCombine/broadcast.ll

llvm/test/Transforms/InstCombine/bswap-inseltpoison.ll

llvm/test/Transforms/InstCombine/bswap.ll

llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll

llvm/test/Transforms/InstCombine/extractelement-inseltpoison.ll

llvm/test/Transforms/InstCombine/extractelement.ll

llvm/test/Transforms/InstCombine/insert-const-shuf.ll

llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll

llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll

llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll

llvm/test/Transforms/InstCombine/masked_intrinsics.ll

llvm/test/Transforms/InstCombine/nsw-inseltpoison.ll

llvm/test/Transforms/InstCombine/nsw.ll

llvm/test/Transforms/InstCombine/reduction-shufflevector.ll

[IR] Rename the shufflevector's undef mask to poison
Needs Review, Public