This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/X86/
-
Target/
-
X86/
-
CMakeLists.txt
-
X86.h
1/6
X86FixupVectorConstants.cpp
-
X86ISelLowering.cpp
-
X86TargetMachine.cpp
-
test/CodeGen/X86/
-
CodeGen/
-
X86/
-
abdu-vector-128.ll
-
any_extend_vector_inreg_of_broadcast.ll
-
any_extend_vector_inreg_of_broadcast_from_memory.ll
-
avg.ll
-
avx-basic.ll
-
avx-logic.ll
-
avx-vbroadcast.ll
-
avx-vperm2x128.ll
-
avx2-arith.ll
-
avx2-fma-fneg-combine.ll
-
avx2-intrinsics-x86.ll
-
avx2-shift.ll
-
avx2-vbroadcast.ll
-
avx2-vector-shifts.ll
-
avx512-arith.ll
-
avx512-regcall-Mask.ll
-
bitcast-int-to-vector-bool-sext.ll
-
bitcast-int-to-vector-bool-zext.ll
-
bitcast-int-to-vector-bool.ll
-
bitcast-vector-bool.ll
-
bool-ext-inc.ll
-
broadcast-elm-cross-splat-vec.ll
-
cast-vsel.ll
-
combine-add.ll
-
combine-addo.ll
-
combine-and.ll
-
combine-bitselect.ll
2
combine-concatvectors.ll
-
combine-fabs.ll
-
combine-fcopysign.ll
-
combine-mul.ll
-
combine-pavg.ll
-
combine-pmuldq.ll
-
combine-rotates.ll
-
combine-sdiv.ll
-
combine-shl.ll
-
combine-smax.ll
-
combine-smin.ll
-
combine-srem.ll
-
combine-srl.ll
-
combine-sub-usat.ll
-
combine-udiv.ll
-
combine-urem.ll
-
concat-cast.ll
-
copy-low-subvec-elt-to-high-subvec-elt.ll
-
exedepsfix-broadcast.ll
-
expand-vp-fp-intrinsics.ll
-
expand-vp-int-intrinsics.ll
-
extractelement-fp.ll
-
extractelement-from-arg.ll
-
extractelement-legalization-cycle.ll
-
extractelement-load.ll
-
extractelement-shuffle.ll
-
fma-intrinsics-fast-isel.ll
-
fma_patterns.ll
-
fma_patterns_wide.ll
-
fold-vector-trunc-sitofp.ll
-
fp-round.ll
-
freeze-binary.ll
-
freeze-vector.ll
-
funnel-shift-rot.ll
-
gfni-funnel-shifts.ll
-
gfni-rotates.ll
-
gfni-shifts.ll
-
hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
-
hoist-and-by-const-from-shl-in-eqcmp-zero.ll
-
horizontal-reduce-umax.ll
2
horizontal-reduce-umin.ll
-
i64-to-float.ll
-
icmp-abs-C-vec.ll
-
icmp-pow2-diff.ll
-
insert-into-constant-vector.ll
-
known-bits-vector.ll
-
machine-combiner-int-vec.ll
-
masked_load.ll
-
masked_store_trunc.ll
-
masked_store_trunc_ssat.ll
-
masked_store_trunc_usat.ll
-
memset-nonzero.ll
-
merge-store-constants.ll
-
midpoint-int-vec-128.ll
-
midpoint-int-vec-256.ll
-
movmsk-cmp.ll
-
oddshuffles.ll
-
omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
-
opt-pipeline.ll
-
packss.ll
-
paddus.ll
-
pmul.ll
-
pmulh.ll
-
pr30290.ll
-
pr32368.ll
-
pr38639.ll
-
prefer-avx256-popcnt.ll
-
psubus.ll
-
recip-fastmath.ll
-
recip-fastmath2.ll
-
sadd_sat_vec.ll
-
sar_fold64.ll
-
sat-add.ll
-
sdiv-exact.ll
-
select-of-fp-constants.ll
-
setcc-non-simple-type.ll
-
shrink_vmul.ll
-
shuffle-blendw.ll
-
shuffle-of-splat-multiuses.ll
-
shuffle-vs-trunc-256.ll
-
slow-pmulld.ll
-
splat-const.ll
-
splat-for-size.ll
-
sqrt-fastmath-mir.ll
-
sqrt-fastmath-tune.ll
-
sqrt-fastmath.ll
-
srem-seteq-illegal-types.ll
-
srem-seteq-vec-nonsplat.ll
-
srem-seteq-vec-splat.ll
-
sse2.ll
-
sshl_sat_vec.ll
-
ssub_sat_vec.ll
-
subvector-broadcast.ll
-
uadd_sat_vec.ll
-
umax.ll
-
urem-seteq-vec-nonsplat.ll
-
urem-seteq-vec-nonzero.ll
-
urem-seteq-vec-splat.ll
-
urem-seteq-vec-tautological.ll
-
urem-seteq.ll
-
urem-vector-lkk.ll
-
usub_sat_vec.ll
-
v8i1-masks.ll
-
var-permute-128.ll
-
var-permute-256.ll
-
vec-strict-fptoint-128.ll
-
vec-strict-fptoint-256.ll
-
vec-strict-inttofp-256.ll
-
vec_cmp_uint-128.ll
-
vec_fabs.ll
-
vec_fp_to_int.ll
-
vec_int_to_fp.ll
-
vec_minmax_uint.ll
-
vec_shift6.ll
-
vec_smulo.ll
-
vec_uaddo.ll
-
vec_uint_to_fp-fastmath.ll
-
vec_uint_to_fp.ll
-
vec_umulo.ll
-
vec_usubo.ll
-
vector-bitreverse.ll
-
vector-blend.ll
-
vector-bo-select.ll
-
vector-constrained-fp-intrinsics.ll
-
vector-fshl-128.ll
-
vector-fshl-256.ll
-
vector-fshl-512.ll
-
vector-fshl-rot-128.ll
-
vector-fshl-rot-256.ll
-
vector-fshl-rot-512.ll
-
vector-fshl-rot-sub128.ll
-
vector-fshr-128.ll
-
vector-fshr-256.ll
-
vector-fshr-512.ll
-
vector-fshr-rot-128.ll
-
vector-fshr-rot-256.ll
-
vector-fshr-rot-512.ll
-
vector-fshr-rot-sub128.ll
-
vector-idiv-sdiv-128.ll
-
vector-idiv-sdiv-256.ll
-
vector-idiv-sdiv-512.ll
-
vector-idiv-udiv-128.ll
-
vector-idiv-udiv-256.ll
-
vector-idiv-udiv-512.ll
-
vector-interleaved-load-i16-stride-5.ll
-
vector-interleaved-load-i16-stride-7.ll
-
vector-interleaved-load-i32-stride-3.ll
-
vector-interleaved-load-i32-stride-4.ll
-
vector-interleaved-load-i32-stride-5.ll
-
vector-interleaved-load-i32-stride-6.ll
-
vector-interleaved-load-i32-stride-7.ll
-
vector-interleaved-load-i8-stride-2.ll
-
vector-interleaved-load-i8-stride-3.ll
-
vector-interleaved-load-i8-stride-4.ll
-
vector-interleaved-load-i8-stride-5.ll
-
vector-interleaved-load-i8-stride-7.ll
-
vector-interleaved-store-i16-stride-6.ll
-
vector-interleaved-store-i16-stride-7.ll
-
vector-interleaved-store-i16-stride-8.ll
-
vector-interleaved-store-i32-stride-3.ll
-
vector-interleaved-store-i32-stride-6.ll
-
vector-interleaved-store-i32-stride-7.ll
-
vector-interleaved-store-i32-stride-8.ll
-
vector-interleaved-store-i8-stride-3.ll
-
vector-interleaved-store-i8-stride-5.ll
-
vector-interleaved-store-i8-stride-7.ll
-
vector-interleaved-store-i8-stride-8.ll
-
vector-lzcnt-512.ll
-
vector-mul.ll
-
vector-pack-128.ll
-
vector-pack-256.ll
-
vector-popcnt-128-ult-ugt.ll
-
vector-popcnt-128.ll
-
vector-popcnt-256-ult-ugt.ll
-
vector-popcnt-256.ll
-
vector-popcnt-512-ult-ugt.ll
-
vector-popcnt-512.ll
-
vector-reduce-add-mask.ll
-
vector-reduce-and-bool.ll
-
vector-reduce-or-bool.ll
-
vector-reduce-or-cmp.ll
-
vector-reduce-umax.ll
-
vector-reduce-umin.ll
-
vector-reduce-xor-bool.ll
-
vector-rotate-128.ll
-
vector-rotate-256.ll
-
vector-rotate-512.ll
-
vector-sext.ll
-
vector-shift-ashr-128.ll
-
vector-shift-ashr-256.ll
-
vector-shift-ashr-512.ll
-
vector-shift-ashr-sub128.ll
-
vector-shift-lshr-256.ll
-
vector-shift-lshr-512.ll
-
vector-shift-shl-256.ll
-
vector-shift-shl-512.ll
-
vector-shuffle-128-v16.ll
-
vector-shuffle-256-v16.ll
-
vector-shuffle-256-v32.ll
-
vector-shuffle-256-v8.ll
-
vector-shuffle-avx512.ll
-
vector-shuffle-combining.ll
-
vector-trunc-math.ll
-
vector-trunc-packus.ll
-
vector-trunc-ssat.ll
-
vector-trunc-usat.ll
-
vector-trunc.ll
-
vector-tzcnt-128.ll
-
vector-tzcnt-256.ll
-
vector-tzcnt-512.ll
-
vector-unsigned-cmp.ll
-
vector-zext.ll
-
vector_splat-const-shift-of-constmasked.ll
-
vselect-avx.ll
-
vselect-minmax.ll
-
vselect-pcmp.ll
-
vselect-post-combine.ll
-
vselect-zero.ll
-
win_cst_pool.ll
-
x86-interleaved-access.ll
-
zero_extend_vector_inreg.ll
-
zero_extend_vector_inreg_of_broadcast.ll
-
zero_extend_vector_inreg_of_broadcast_from_memory.ll

Differential D150143

[X86] Add X86FixupVectorConstantsPass to fold vectors constant loads as broadcasts (WIP)
AbandonedPublic

Authored by RKSimon on May 8 2023, 2:05 PM.

Download Raw Diff

Details

Reviewers

pengfei
goldstein.w.n
craig.topper
andreadb

Summary

This is a WIP patch to remove the broadcasting of constants from the DAG and to instead perform this in a later pass. I'd like to hear people's thoughts on the approach while it's still in the early stages.

The principal aim is to prevent the premature creation of broadcasts that prevent us folding the loads with another instruction, helping to reduce register pressure.

There's still a lot to be addressed in this early patch including:

Subvector Broadcast handling (VBROADCASTF128 etc.).
Folding of AVX512 constant loads (including masked loads) to AVX512 broadcasts.
Folding of AVX512 instruction with folded constant loads to folded broadcasts.
Better use of AVX (fp broadcasts) and SSE3 (movddup) broadcast instructions - the comment printouts are a mess of float / integer, which we might want to address first?
Use of VPMOVZX/VPMOVSX extension loads for non-uniform constants that are representable with smaller integers
Remove the constant support entirely from lowerBuildVectorAsBroadcast in DAG

Diff Detail

Repository: rG LLVM Github Monorepo

Unit TestsFailed

	Time	Test
	3,730 ms	x64 debian > AddressSanitizer-x86_64-linux-dynamic.TestCases/Linux::auto_memory_profile_test.cpp
	4,270 ms	x64 debian > AddressSanitizer-x86_64-linux.TestCases/Linux::auto_memory_profile_test.cpp

Event Timeline

RKSimon created this revision.May 8 2023, 2:05 PM

Herald added a project: Restricted Project. · View Herald TranscriptMay 8 2023, 2:05 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

RKSimon requested review of this revision.May 8 2023, 2:05 PM

Herald added a project: Restricted Project. · View Herald TranscriptMay 8 2023, 2:05 PM

Add as much context as possible without exceeding phab's limit

Harbormaster completed remote builds in B230702: Diff 520480.May 8 2023, 3:18 PM

The principal aim is to prevent the premature creation of broadcasts that prevent us folding the loads with another instruction, helping to reduce register pressure.

There are two other problems we have with constant generation that it might also be nice to address with a new pass but that I don't think this approach will be
able to handle.

There are a variety of cases where we could get "better" constants by re-ordering instructions, which is very difficult to do at any stage in the current DAG lowering process without creating an infinite loop, but is also something that would be very difficult to do after lowering at the machine-instruction level. I tried to do this for `shl; and` in D141653, but it ran into an infinite loop when I did more robust testing, and we could imagine also doing it for vector-compares, add/sub, etc...

There are constants like splat(1), splat(mask), splat(-mask), etc... which can often be preferable to build without a load at all (abs(ALL_ONES), shr(ALL_ONES), shl(ALL_ONES), etc...), especially if the fairly common ALL_ONES node dominates.

I think both of these goals lend themselves to a pass before DAG lowering has been completed, but after the common transforms/optimizations.
I was thinking a pass in CodeGenAndEmitDAG after the fourth post-legalization Combine but before DoInstructionSelection would make the
most sense for fixing up constant generation (including the aims of this patch).

pengfei added inline comments.May 8 2023, 9:12 PM

llvm/lib/Target/X86/X86FixupVectorConstants.cpp
208	What's the reason to check `hasInt256` rather than `hasAVX2`?
211	Should avoid FP instruction for integer?
227	Why not use `VBROADCASTSD` here?
llvm/test/CodeGen/X86/combine-concatvectors.ll
67–68	Regression?
llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
1170–1171	This looks like a regression too.

junaire added a subscriber: junaire.May 9 2023, 12:00 AM

In D150143#4328111, @goldstein.w.n wrote:

The principal aim is to prevent the premature creation of broadcasts that prevent us folding the loads with another instruction, helping to reduce register pressure.

There are two other problems we have with constant generation that it might also be nice to address with a new pass but that I don't think this approach will be
able to handle.

There are a variety of cases where we could get "better" constants by re-ordering instructions, which is very difficult to do at any stage in the current DAG lowering process without creating an infinite loop, but is also something that would be very difficult to do after lowering at the machine-instruction level. I tried to do this for `shl; and` in D141653, but it ran into an infinite loop when I did more robust testing, and we could imagine also doing it for vector-compares, add/sub, etc...

There are constants like splat(1), splat(mask), splat(-mask), etc... which can often be preferable to build without a load at all (abs(ALL_ONES), shr(ALL_ONES), shl(ALL_ONES), etc...), especially if the fairly common ALL_ONES node dominates.

I think both of these goals lend themselves to a pass before DAG lowering has been completed, but after the common transforms/optimizations.
I was thinking a pass in CodeGenAndEmitDAG after the fourth post-legalization Combine but before DoInstructionSelection would make the
most sense for fixing up constant generation (including the aims of this patch).

I think we agree that broadcasting the constants in build vector lowering is premature and working on full width constant data is going to be a lot easier.

I have been investigating constant rematerialization as part of this patch and it's looking like it should be handled separately (maybe as part of better general instruction rematerialization handling) - your proposal to investigate fitting rematerialization into CodeGenAndEmitDAG makes sense to me.

But I'm seeing most problems with premature constant broadcasting regarding folding of the constants to stack and hoisting, so a great deal of these cases are going to need to be addressed much later than DAG. I think I have this proposed pass in the correct location for this, but I will drop the proposal for it to handle constant rematerialization; this should be a true fixup for poorly lowered vector constant loads, missed AVX512 broadcast folds, etc.

I'll split off another patch.

llvm/lib/Target/X86/X86FixupVectorConstants.cpp
208	There's actually very little consistency between the 2 checks (which are identical). I try to use hasInt256 when I'm identifying cases where we're selecting 256-bit ops between AVX1 and AVX2 for 256-bit integer ops - I can change it to hasAVX2().
211	Interestingly, DAG already uses MOVDDUP in some cases to broadcast integers, but I agree we should avoid it in most cases.
227	xmm VBROADCASTSD doesn't exist - it would have exactly the same behaviour as VMOVDDUP
llvm/test/CodeGen/X86/combine-concatvectors.ll
67–68	Yes, we previously reused the ymm0 splat to avoid a second load
llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
1170–1171	Yes, we have some peepholes in DAG to reuse broadcasts of different widths - I'll address this in a later iteration.

RKSimon edited the summary of this revision. (Show Details)May 14 2023, 7:09 AM

RKSimon mentioned this in D150526: [X86] Add X86FixupVectorConstantsPass to re-fold AVX512 vector load folds as broadcast folds.May 14 2023, 8:20 AM

Matt added a subscriber: Matt.May 22 2023, 2:31 PM

RKSimon mentioned this in rG0b91de5ea32d: [X86] Add X86FixupVectorConstantsPass to re-fold AVX512 vector load folds as….May 23 2023, 3:01 AM

RKSimon mentioned this in rG0f8e0f422880: [X86] lowerBuildVectorAsBroadcast - broadcast Constant of original….May 27 2023, 6:30 AM

RKSimon abandoned this revision.Nov 2 2023, 9:34 AM

Herald added subscribers: wangpc, sunshaoce. · View Herald TranscriptNov 2 2023, 9:34 AM

Large Diff

This large diff affects 255 files. Files without inline comments have been collapsed. Expand All Files

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

CMakeLists.txt

1 line

X86.h

6 lines

X86FixupVectorConstants.cpp

255 lines

X86ISelLowering.cpp

2 lines

X86TargetMachine.cpp

1 line

test/

CodeGen/

X86/

abdu-vector-128.ll

10 lines

any_extend_vector_inreg_of_broadcast.ll

4 lines

any_extend_vector_inreg_of_broadcast_from_memory.ll

4 lines

9 lines

2 lines

127 lines

4 lines

68 lines

16 lines

avx2-fma-fneg-combine.ll

6 lines

avx2-intrinsics-x86.ll

18 lines

avx2-shift.ll

4 lines

avx2-vbroadcast.ll

38 lines

avx2-vector-shifts.ll

18 lines

avx512-arith.ll

6 lines

avx512-regcall-Mask.ll

3 lines

bitcast-int-to-vector-bool-sext.ll

21 lines

bitcast-int-to-vector-bool-zext.ll

35 lines

bitcast-int-to-vector-bool.ll

6 lines

bitcast-vector-bool.ll

63 lines

bool-ext-inc.ll

12 lines

broadcast-elm-cross-splat-vec.ll

803 lines

4 lines

24 lines

3 lines

10 lines

45 lines

combine-concatvectors.ll

3 lines

18 lines

54 lines

15 lines

38 lines

3 lines

15 lines

57 lines

27 lines

44 lines

8 lines

66 lines

9 lines

3 lines

18 lines

28 lines

17 lines

copy-low-subvec-elt-to-high-subvec-elt.ll

2 lines

exedepsfix-broadcast.ll

9 lines

expand-vp-fp-intrinsics.ll

6 lines

expand-vp-int-intrinsics.ll

12 lines

extractelement-fp.ll

38 lines

extractelement-from-arg.ll

1 line

extractelement-legalization-cycle.ll

11 lines

extractelement-load.ll

131 lines

extractelement-shuffle.ll

1 line

fma-intrinsics-fast-isel.ll

9 lines

fma_patterns.ll

42 lines

fma_patterns_wide.ll

60 lines

fold-vector-trunc-sitofp.ll

3 lines

8 lines

6 lines

25 lines

6 lines

gfni-funnel-shifts.ll

4 lines

gfni-rotates.ll

4 lines

gfni-shifts.ll

37 lines

hoist-and-by-const-from-lshr-in-eqcmp-zero.ll

18 lines

hoist-and-by-const-from-shl-in-eqcmp-zero.ll

18 lines

horizontal-reduce-umax.ll

47 lines

horizontal-reduce-umin.ll

47 lines

i64-to-float.ll

6 lines

icmp-abs-C-vec.ll

60 lines

icmp-pow2-diff.ll

5 lines

insert-into-constant-vector.ll

28 lines

known-bits-vector.ll

8 lines

machine-combiner-int-vec.ll

40 lines

masked_load.ll

3 lines

masked_store_trunc.ll

10 lines

masked_store_trunc_ssat.ll

288 lines

masked_store_trunc_usat.ll

445 lines

memset-nonzero.ll

16 lines

merge-store-constants.ll

4 lines

midpoint-int-vec-128.ll

528 lines

midpoint-int-vec-256.ll

55 lines

movmsk-cmp.ll

130 lines

oddshuffles.ll

18 lines

omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll

31 lines

1 line

26 lines

24 lines

56 lines

2 lines

2 lines

20 lines

5 lines

prefer-avx256-popcnt.ll

48 lines

54 lines

172 lines

66 lines

33 lines

12 lines

29 lines

6 lines

select-of-fp-constants.ll

5 lines

setcc-non-simple-type.ll

4 lines

shrink_vmul.ll

6 lines

shuffle-blendw.ll

6 lines

shuffle-of-splat-multiuses.ll

2 lines

shuffle-vs-trunc-256.ll

27 lines

171 lines

4 lines

182 lines

187 lines

sqrt-fastmath-tune.ll

34 lines

sqrt-fastmath.ll

53 lines

srem-seteq-illegal-types.ll

3 lines

srem-seteq-vec-nonsplat.ll

68 lines

srem-seteq-vec-splat.ll

56 lines

sse2.ll

4 lines

sshl_sat_vec.ll

15 lines

ssub_sat_vec.ll

33 lines

subvector-broadcast.ll

54 lines

uadd_sat_vec.ll

45 lines

umax.ll

42 lines

urem-seteq-vec-nonsplat.ll

6 lines

urem-seteq-vec-nonzero.ll

21 lines

urem-seteq-vec-splat.ll

31 lines

urem-seteq-vec-tautological.ll

8 lines

9 lines

50 lines

44 lines

300 lines

5 lines

41 lines

vec-strict-fptoint-128.ll

4 lines

vec-strict-fptoint-256.ll

8 lines

vec-strict-inttofp-256.ll

71 lines

8 lines

8 lines

136 lines

204 lines

8 lines

6 lines

4 lines

37 lines

vec_uint_to_fp-fastmath.ll

188 lines

232 lines

14 lines

37 lines

264 lines

20 lines

10 lines

vector-constrained-fp-intrinsics.ll

74 lines

vector-fshl-128.ll

164 lines

vector-fshl-256.ll

24 lines

vector-fshl-512.ll

16 lines

vector-fshl-rot-128.ll

45 lines

vector-fshl-rot-256.ll

11 lines

vector-fshl-rot-512.ll

6 lines

vector-fshl-rot-sub128.ll

3 lines

vector-fshr-128.ll

287 lines

vector-fshr-256.ll

32 lines

vector-fshr-512.ll

20 lines

vector-fshr-rot-128.ll

86 lines

vector-fshr-rot-256.ll

17 lines

vector-fshr-rot-512.ll

6 lines

vector-fshr-rot-sub128.ll

3 lines

vector-idiv-sdiv-128.ll

56 lines

vector-idiv-sdiv-256.ll

62 lines

vector-idiv-sdiv-512.ll

28 lines

vector-idiv-udiv-128.ll

48 lines

vector-idiv-udiv-256.ll

54 lines

vector-idiv-udiv-512.ll

18 lines

vector-interleaved-load-i16-stride-5.ll

28 lines

vector-interleaved-load-i16-stride-7.ll

929 lines

vector-interleaved-load-i32-stride-3.ll

90 lines

vector-interleaved-load-i32-stride-4.ll

1232 lines

vector-interleaved-load-i32-stride-5.ll

372 lines

vector-interleaved-load-i32-stride-6.ll

1473 lines

vector-interleaved-load-i32-stride-7.ll

5014 lines

vector-interleaved-load-i8-stride-2.ll

46 lines

vector-interleaved-load-i8-stride-3.ll

12 lines

vector-interleaved-load-i8-stride-4.ll

385 lines

vector-interleaved-load-i8-stride-5.ll

19 lines

vector-interleaved-load-i8-stride-7.ll

2844 lines

vector-interleaved-store-i16-stride-6.ll

24 lines

vector-interleaved-store-i16-stride-7.ll

6 lines

vector-interleaved-store-i16-stride-8.ll

6 lines

vector-interleaved-store-i32-stride-3.ll

12 lines

vector-interleaved-store-i32-stride-6.ll

649 lines

vector-interleaved-store-i32-stride-7.ll

655 lines

vector-interleaved-store-i32-stride-8.ll

36 lines

vector-interleaved-store-i8-stride-3.ll

9 lines

vector-interleaved-store-i8-stride-5.ll

9 lines

vector-interleaved-store-i8-stride-7.ll

102 lines

vector-interleaved-store-i8-stride-8.ll

1197 lines

24 lines

18 lines

12 lines

3 lines

vector-popcnt-128-ult-ugt.ll

999 lines

vector-popcnt-128.ll

59 lines

vector-popcnt-256-ult-ugt.ll

834 lines

vector-popcnt-256.ll

10 lines

vector-popcnt-512-ult-ugt.ll

596 lines

vector-popcnt-512.ll

10 lines

vector-reduce-add-mask.ll

14 lines

vector-reduce-and-bool.ll

63 lines

vector-reduce-or-bool.ll

63 lines

vector-reduce-or-cmp.ll

43 lines

vector-reduce-umax.ll

57 lines

vector-reduce-umin.ll

57 lines

vector-reduce-xor-bool.ll

2 lines

36 lines

11 lines

6 lines

6 lines

vector-shift-ashr-128.ll

114 lines

vector-shift-ashr-256.ll

34 lines

vector-shift-ashr-512.ll

8 lines

vector-shift-ashr-sub128.ll

87 lines

vector-shift-lshr-256.ll

9 lines

vector-shift-lshr-512.ll

6 lines

vector-shift-shl-256.ll

6 lines

vector-shift-shl-512.ll

6 lines

vector-shuffle-128-v16.ll

144 lines

vector-shuffle-256-v16.ll

19 lines

vector-shuffle-256-v32.ll

29 lines

vector-shuffle-256-v8.ll

27 lines

vector-shuffle-avx512.ll

4 lines

vector-shuffle-combining.ll

49 lines

vector-trunc-math.ll

52 lines

vector-trunc-packus.ll

193 lines

197 lines

301 lines

38 lines

334 lines

44 lines

12 lines

vector-unsigned-cmp.ll

118 lines

vector-zext.ll

3 lines

vector_splat-const-shift-of-constmasked.ll

880 lines

vselect-avx.ll

16 lines

vselect-minmax.ll

16 lines

vselect-pcmp.ll

2 lines

vselect-post-combine.ll

2 lines

vselect-zero.ll

6 lines

win_cst_pool.ll

14 lines

x86-interleaved-access.ll

109 lines

zero_extend_vector_inreg.ll

12 lines

zero_extend_vector_inreg_of_broadcast.ll

10 lines

zero_extend_vector_inreg_of_broadcast_from_memory.ll

10 lines

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Add X86FixupVectorConstantsPass to fold vectors constant loads as broadcasts (WIP)AbandonedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Large Diff

Revision Contents

Diff 520480

llvm/lib/Target/X86/CMakeLists.txt

llvm/lib/Target/X86/X86.h

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/lib/Target/X86/X86TargetMachine.cpp

llvm/test/CodeGen/X86/abdu-vector-128.ll

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

llvm/test/CodeGen/X86/avg.ll

llvm/test/CodeGen/X86/avx-basic.ll

llvm/test/CodeGen/X86/avx-logic.ll

llvm/test/CodeGen/X86/avx-vbroadcast.ll

llvm/test/CodeGen/X86/avx-vperm2x128.ll

llvm/test/CodeGen/X86/avx2-arith.ll

llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll

llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll

llvm/test/CodeGen/X86/avx2-shift.ll

llvm/test/CodeGen/X86/avx2-vbroadcast.ll

llvm/test/CodeGen/X86/avx2-vector-shifts.ll

llvm/test/CodeGen/X86/avx512-arith.ll

llvm/test/CodeGen/X86/avx512-regcall-Mask.ll

llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll

llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll

llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll

llvm/test/CodeGen/X86/bitcast-vector-bool.ll

llvm/test/CodeGen/X86/bool-ext-inc.ll

llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll

llvm/test/CodeGen/X86/cast-vsel.ll

llvm/test/CodeGen/X86/combine-add.ll

llvm/test/CodeGen/X86/combine-addo.ll

llvm/test/CodeGen/X86/combine-and.ll

llvm/test/CodeGen/X86/combine-bitselect.ll

llvm/test/CodeGen/X86/combine-concatvectors.ll

llvm/test/CodeGen/X86/combine-fabs.ll

llvm/test/CodeGen/X86/combine-fcopysign.ll

llvm/test/CodeGen/X86/combine-mul.ll

llvm/test/CodeGen/X86/combine-pavg.ll

llvm/test/CodeGen/X86/combine-pmuldq.ll

llvm/test/CodeGen/X86/combine-rotates.ll

llvm/test/CodeGen/X86/combine-sdiv.ll

llvm/test/CodeGen/X86/combine-shl.ll

llvm/test/CodeGen/X86/combine-smax.ll

llvm/test/CodeGen/X86/combine-smin.ll

llvm/test/CodeGen/X86/combine-srem.ll

llvm/test/CodeGen/X86/combine-srl.ll

llvm/test/CodeGen/X86/combine-sub-usat.ll

llvm/test/CodeGen/X86/combine-udiv.ll

llvm/test/CodeGen/X86/combine-urem.ll

llvm/test/CodeGen/X86/concat-cast.ll

llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll

llvm/test/CodeGen/X86/exedepsfix-broadcast.ll

llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll

llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll

llvm/test/CodeGen/X86/extractelement-fp.ll

llvm/test/CodeGen/X86/extractelement-from-arg.ll

llvm/test/CodeGen/X86/extractelement-legalization-cycle.ll

llvm/test/CodeGen/X86/extractelement-load.ll

llvm/test/CodeGen/X86/extractelement-shuffle.ll

llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll

llvm/test/CodeGen/X86/fma_patterns.ll

llvm/test/CodeGen/X86/fma_patterns_wide.ll

llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll

llvm/test/CodeGen/X86/fp-round.ll

llvm/test/CodeGen/X86/freeze-binary.ll

llvm/test/CodeGen/X86/freeze-vector.ll

llvm/test/CodeGen/X86/funnel-shift-rot.ll

llvm/test/CodeGen/X86/gfni-funnel-shifts.ll

llvm/test/CodeGen/X86/gfni-rotates.ll

llvm/test/CodeGen/X86/gfni-shifts.ll

llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll

llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll

[X86] Add X86FixupVectorConstantsPass to fold vectors constant loads as broadcasts (WIP)
AbandonedPublic