This is an archive of the discontinued LLVM Phabricator instance.

[RISCV] Cost model for general case of single vector permute
Closed, Public

Authored by reames on Mar 27 2023, 1:46 PM.

Details

Summary

The cost model was not accounting for the fact that we can generate a vrgather plus a materialized index vector for the general single-source permute case.

One thing to call out: I did not model the difference between vrgather and vrgatherei16. As a result, the constant pool cost can be slightly understated on RV32. I'm not sure we care; if we do, I can add handling for that case.
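
For reference, the lowering being costed here is a constant-pool load of the index vector followed by a single vrgather.vv. A hand-written sketch for a small single-source v16i8 shuffle (register choices and the .LCPI symbol are illustrative, not actual compiler output):

	lui	a0, %hi(.LCPI0_0)
	addi	a0, a0, %lo(.LCPI0_0)
	vsetivli	zero, 16, e8, m1, ta, ma
	vle8.v	v10, (a0)                       # materialize the constant index vector
	vrgather.vv	v9, v8, v10             # one gather performs the whole permute
	vmv.v.v	v8, v9
	ret

So the modelled cost is roughly the index-vector materialization plus one gather, scaled by LMUL.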

Diff Detail

Event Timeline

reames created this revision. Mar 27 2023, 1:46 PM
Herald added a project: Restricted Project. Mar 27 2023, 1:46 PM
reames requested review of this revision. Mar 27 2023, 1:46 PM
luke accepted this revision. Mar 28 2023, 3:22 AM

LGTM, this matches up with what I'm seeing when generating random masks for shuffles.

Just a small observation: could we improve codegen and use vrgatherei16 for i8 and n > 256? The generated code at the moment for a v512i8 vector shuffle is less than ideal (a rough sketch of what a vrgatherei16-based lowering might look like follows the v256i8 listing below):

v512:                                   # @v512
	.cfi_startproc
# %bb.0:
	lui	a0, %hi(.LCPI1_0)
	ld	a0, %lo(.LCPI1_0)(a0)
	vsetivli	zero, 8, e64, m1, ta, ma
	vmv.s.x	v12, a0
	vsetivli	zero, 2, e64, m1, tu, ma
	vmv1r.v	v0, v12
	vslideup.vi	v0, v12, 1
	vsetivli	zero, 3, e64, m1, tu, ma
	vslideup.vi	v0, v12, 2
	vsetivli	zero, 4, e64, m1, tu, ma
	vslideup.vi	v0, v12, 3
	vsetivli	zero, 5, e64, m1, tu, ma
	vslideup.vi	v0, v12, 4
	vsetivli	zero, 6, e64, m1, tu, ma
	vslideup.vi	v0, v12, 5
	vsetivli	zero, 7, e64, m1, tu, ma
	vslideup.vi	v0, v12, 6
	vsetivli	zero, 8, e64, m1, ta, ma
	vslideup.vi	v0, v12, 7
	vsetivli	zero, 1, e8, m4, ta, ma
	vslidedown.vi	v12, v8, 1
	vmv.x.s	a1, v12
	li	a0, 512
	vsetvli	zero, a0, e8, m4, ta, ma
	vmv.v.x	v12, a1
	vsetivli	zero, 1, e8, m4, ta, ma
	vslidedown.vi	v16, v8, 5
	vmv.x.s	a1, v16
	lui	a2, %hi(.LCPI1_1)
	ld	a2, %lo(.LCPI1_1)(a2)
	vsetvli	zero, a0, e8, m4, ta, ma
	vmerge.vxm	v12, v12, a1, v0
	vsetivli	zero, 8, e64, m1, ta, ma
	vmv.s.x	v16, a2
	vsetivli	zero, 2, e64, m1, tu, ma
	vmv1r.v	v0, v16
	vslideup.vi	v0, v16, 1
	vsetivli	zero, 3, e64, m1, tu, ma
	vslideup.vi	v0, v16, 2
	vsetivli	zero, 4, e64, m1, tu, ma
	vslideup.vi	v0, v16, 3
	vsetivli	zero, 5, e64, m1, tu, ma
	vslideup.vi	v0, v16, 4
	vsetivli	zero, 6, e64, m1, tu, ma
	vslideup.vi	v0, v16, 5
	vsetivli	zero, 7, e64, m1, tu, ma
	vslideup.vi	v0, v16, 6
	vsetivli	zero, 8, e64, m1, ta, ma
	vslideup.vi	v0, v16, 7
	vsetivli	zero, 1, e8, m4, ta, ma
	vslidedown.vi	v16, v8, 4
	vmv.x.s	a1, v16
	lui	a2, %hi(.LCPI1_2)
	ld	a2, %lo(.LCPI1_2)(a2)
	vsetvli	zero, a0, e8, m4, ta, ma
	vmerge.vxm	v12, v12, a1, v0
	vsetivli	zero, 8, e64, m1, ta, ma
	vmv.s.x	v16, a2
	vsetivli	zero, 2, e64, m1, tu, ma
	vmv1r.v	v0, v16
	vslideup.vi	v0, v16, 1
	vsetivli	zero, 3, e64, m1, tu, ma
	vslideup.vi	v0, v16, 2
	vsetivli	zero, 4, e64, m1, tu, ma
	vslideup.vi	v0, v16, 3
	vsetivli	zero, 5, e64, m1, tu, ma
	vslideup.vi	v0, v16, 4
	vsetivli	zero, 6, e64, m1, tu, ma
	vslideup.vi	v0, v16, 5
	vsetivli	zero, 7, e64, m1, tu, ma
	vslideup.vi	v0, v16, 6
	vsetivli	zero, 8, e64, m1, ta, ma
	vslideup.vi	v0, v16, 7
	vsetivli	zero, 1, e8, m4, ta, ma
	vslidedown.vi	v16, v8, 7
	vmv.x.s	a1, v16
	lui	a2, %hi(.LCPI1_3)
	ld	a2, %lo(.LCPI1_3)(a2)
	vsetvli	zero, a0, e8, m4, ta, ma
	vmerge.vxm	v12, v12, a1, v0
	vsetivli	zero, 8, e64, m1, ta, ma
	vmv.s.x	v16, a2
	vsetivli	zero, 2, e64, m1, tu, ma
	vmv1r.v	v0, v16
	vslideup.vi	v0, v16, 1
	vsetivli	zero, 3, e64, m1, tu, ma
	vslideup.vi	v0, v16, 2
	vsetivli	zero, 4, e64, m1, tu, ma
	vslideup.vi	v0, v16, 3
	vsetivli	zero, 5, e64, m1, tu, ma
	vslideup.vi	v0, v16, 4
	vsetivli	zero, 6, e64, m1, tu, ma
	vslideup.vi	v0, v16, 5
	vsetivli	zero, 7, e64, m1, tu, ma
	vslideup.vi	v0, v16, 6
	vsetivli	zero, 8, e64, m1, ta, ma
	vslideup.vi	v0, v16, 7
	vsetivli	zero, 1, e8, m4, ta, ma
	vslidedown.vi	v16, v8, 3
	vmv.x.s	a1, v16
	lui	a2, %hi(.LCPI1_4)
	ld	a2, %lo(.LCPI1_4)(a2)
	vsetvli	zero, a0, e8, m4, ta, ma
	vmerge.vxm	v12, v12, a1, v0
	vsetivli	zero, 8, e64, m1, ta, ma
	vmv.s.x	v16, a2
	vsetivli	zero, 2, e64, m1, tu, ma
	vmv1r.v	v0, v16
	vslideup.vi	v0, v16, 1
	vsetivli	zero, 3, e64, m1, tu, ma
	vslideup.vi	v0, v16, 2
	vsetivli	zero, 4, e64, m1, tu, ma
	vslideup.vi	v0, v16, 3
	vsetivli	zero, 5, e64, m1, tu, ma
	vslideup.vi	v0, v16, 4
	vsetivli	zero, 6, e64, m1, tu, ma
	vslideup.vi	v0, v16, 5
	vsetivli	zero, 7, e64, m1, tu, ma
	vslideup.vi	v0, v16, 6
	vsetivli	zero, 8, e64, m1, ta, ma
	vslideup.vi	v0, v16, 7
	vsetivli	zero, 0, e8, m4, ta, ma
	vmv.x.s	a1, v8
	lui	a2, %hi(.LCPI1_5)
	ld	a2, %lo(.LCPI1_5)(a2)
	vsetvli	zero, a0, e8, m4, ta, ma
	vmerge.vxm	v12, v12, a1, v0
	vsetivli	zero, 8, e64, m1, ta, ma
	vmv.s.x	v16, a2
	vsetivli	zero, 2, e64, m1, tu, ma
	vmv1r.v	v0, v16
	vslideup.vi	v0, v16, 1
	vsetivli	zero, 3, e64, m1, tu, ma
	vslideup.vi	v0, v16, 2
	vsetivli	zero, 4, e64, m1, tu, ma
	vslideup.vi	v0, v16, 3
	vsetivli	zero, 5, e64, m1, tu, ma
	vslideup.vi	v0, v16, 4
	vsetivli	zero, 6, e64, m1, tu, ma
	vslideup.vi	v0, v16, 5
	vsetivli	zero, 7, e64, m1, tu, ma
	vslideup.vi	v0, v16, 6
	vsetivli	zero, 8, e64, m1, ta, ma
	vslideup.vi	v0, v16, 7
	vsetivli	zero, 1, e8, m4, ta, ma
	vslidedown.vi	v8, v8, 2
	vmv.x.s	a1, v8
	vsetvli	zero, a0, e8, m4, ta, ma
	vmerge.vxm	v8, v12, a1, v0
	ret

Below is the v256i8 case for comparison, which is what we are modelling:

v256:                                   # @v256
	.cfi_startproc
# %bb.0:
	lui	a0, %hi(.LCPI0_0)
	addi	a0, a0, %lo(.LCPI0_0)
	li	a1, 32
	vsetvli	zero, a1, e64, m2, ta, ma
	vlse64.v	v12, (a0), zero
	li	a0, 256
	vsetvli	zero, a0, e8, m2, ta, ma
	vrgather.vv	v10, v8, v12
	vmv.v.v	v8, v10
	ret
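
For what it's worth, a vrgatherei16-based lowering of the v512i8 case could be about as compact as the v256i8 one. A hypothetical sketch (not current compiler output; it assumes a VLEN where 512 x i8 fits at m4 and 512 x i16 fits at m8, and the register choices and .LCPI symbol are illustrative):

	lui	a0, %hi(.LCPI1_0)
	addi	a0, a0, %lo(.LCPI1_0)
	li	a1, 512
	vsetvli	zero, a1, e16, m8, ta, ma
	vle16.v	v16, (a0)                       # load 512 x i16 index vector
	vsetvli	zero, a1, e8, m4, ta, ma
	vrgatherei16.vv	v12, v8, v16            # 16-bit indices cover all 512 elements
	vmv.v.v	v8, v12
	ret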
This revision is now accepted and ready to land. Mar 28 2023, 3:22 AM
This revision was landed with ongoing or failed builds. Mar 28 2023, 7:34 AM
This revision was automatically updated to reflect the committed changes.