diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -26,6 +26,8 @@
   bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
 }
 
+/// Number of DLEN parts = (LMUL * VLEN) / DLEN.
+/// Since DLEN = VLEN / 2, Num DLEN parts = 2 * LMUL.
 class SiFive7GetCyclesDefault<string mx> {
   int c = !cond(
     !eq(mx, "M1") : 2,
@@ -84,25 +86,50 @@
   );
 }
 
-// Cycles for segmented loads and stores are calculated using the
-// formula ceil(2 * nf * lmul).
-class SiFive7GetCyclesSegmented<string mx, int nf> {
+/// VLDM and VSTM can't read/write more than 2 DLENs of data.
+/// 2 DLENs when LMUL=8. 1 DLEN for all other LMULs.
+class SiFive7GetMaskLoadStoreCycles<string mx> {
+  int c = !cond(
+    !eq(mx, "M8") : 2,
+    true : 1
+  );
+}
+
+// Cycles for nf=2 segmented loads and stores are calculated using the
+// formula (2 * VLEN * LMUL) / DLEN = 4 * LMUL
+class SiFive7GetCyclesSegmentedSeg2<string mx> {
   int c = !cond(
-    !eq(mx, "M1") : !mul(!mul(2, nf), 1),
-    !eq(mx, "M2") : !mul(!mul(2, nf), 2),
-    !eq(mx, "M4") : !mul(!mul(2, nf), 4),
-    !eq(mx, "M8") : !mul(!mul(2, nf), 8),
-    // We can calculate ceil(a/b) using (a + b - 1) / b.
-    // Since the multiplication of fractional lmul is the
-    // same as division by the denominator the formula we
-    // use is ceil(2 * nf / lmul_denominator). We can use
-    // ceil(a/b) where a = 2 * nf, b = lmul_denominator.
-    !eq(mx, "MF2") : !div(!sub(!add(!mul(2, nf), 2), 1), 2),
-    !eq(mx, "MF4") : !div(!sub(!add(!mul(2, nf), 4), 1), 4),
-    !eq(mx, "MF8") : !div(!sub(!add(!mul(2, nf), 8), 1), 8)
+    !eq(mx, "M1") : 4,
+    !eq(mx, "M2") : 8,
+    !eq(mx, "M4") : 16,
+    !eq(mx, "M8") : 32,
+    !eq(mx, "MF2") : 2,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF8") : 1
   );
 }
 
+// Cycles for segmented loads and stores are calculated using the
+// formula vl * ceil((SEW * nf) / DLEN), where SEW * nf is the segment size.
+class SiFive7GetCyclesSegmented<string mx, int sew, int nf> {
+  defvar VLEN = 512;
+  defvar DLEN = 256;
+  // (VLEN * LMUL) / SEW
+  defvar VLUpperBound = !cond(
+    !eq(mx, "M1") : !div(VLEN, sew),
+    !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
+    !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
+    !eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
+    !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
+    !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
+    !eq(mx, "MF8") : !div(!div(VLEN, 8), sew)
+  );
+  // We can calculate ceil(a/b) using (a + b - 1) / b.
+  defvar a = !mul(sew, nf);
+  defvar b = DLEN;
+  int c = !mul(VLUpperBound, !div(!sub(!add(a, b), 1), b));
+}
+
 class SiFive7GetCyclesOnePerElement<string mx, int sew> {
   // FIXME: On SiFive7, VLEN is 512. Although a user can request the compiler
   // to use a different VLEN, this model will not make scheduling decisions
@@ -359,39 +386,89 @@
 }
 
 // 7. Vector Loads and Stores
+// Unit-stride loads and stores can operate at the full bandwidth of the memory
+// pipe. The memory pipe is DLEN bits wide on x280.
 foreach mx = SchedMxList in {
   defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
   defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = Cycles, ResourceCycles = [Cycles] in {
+  let Latency = 4, ResourceCycles = [Cycles] in {
     defm "" : LMULWriteResMX<"WriteVLDE", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTE", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDM", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTM", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDS8", [SiFive7VL], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDFF", [SiFive7VL], mx, IsWorstCase>;
+  }
+  let Latency = 1, ResourceCycles = [Cycles] in
+    defm "" : LMULWriteResMX<"WriteVSTE", [SiFive7VS], mx, IsWorstCase>;
+}
+
+foreach mx = SchedMxList in {
+  defvar Cycles = SiFive7GetMaskLoadStoreCycles<mx>.c;
+  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 4, ResourceCycles = [Cycles] in
+    defm "" : LMULWriteResMX<"WriteVLDM", [SiFive7VL], mx, IsWorstCase>;
+  let Latency = 1, ResourceCycles = [Cycles] in
+    defm "" : LMULWriteResMX<"WriteVSTM", [SiFive7VS], mx, IsWorstCase>;
+}
+
+// Strided loads and stores operate at one element per cycle and should be
+// scheduled accordingly. Indexed loads and stores operate at one element per
+// cycle, and they stall the machine until all addresses have been generated,
+// so they cannot be scheduled. Indexed and strided loads and stores have LMUL
+// specific suffixes, but since SEW is already encoded in the name of the
+// resource, we do not need to use LMULSEWXXX constructors. However, we do
+// use the SEW from the name to determine the number of Cycles.
+foreach mx = SchedMxList in {
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8>.c;
+  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = !add(3, Cycles), ResourceCycles = [Cycles] in {
+    defm "" : LMULWriteResMX<"WriteVLDS8", [SiFive7VL], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFive7VL], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFive7VL], mx, IsWorstCase>;
+  }
+  let Latency = 1, ResourceCycles = [Cycles] in {
+    defm "" : LMULWriteResMX<"WriteVSTS8", [SiFive7VS], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFive7VS], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFive7VS], mx, IsWorstCase>;
+  }
+}
+foreach mx = SchedMxList in {
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16>.c;
+  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = !add(3, Cycles), ResourceCycles = [Cycles] in {
     defm "" : LMULWriteResMX<"WriteVLDS16", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDS32", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDS64", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTS8", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTS16", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTS32", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTS64", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFive7VL], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFive7VL], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFive7VS], mx, IsWorstCase>;
+  }
+  let Latency = 1, ResourceCycles = [Cycles] in {
+    defm "" : LMULWriteResMX<"WriteVSTS16", [SiFive7VS], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFive7VS], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFive7VS], mx, IsWorstCase>;
+  }
+}
+foreach mx = SchedMxList in {
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32>.c;
+  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = !add(3, Cycles), ResourceCycles = [Cycles] in {
+    defm "" : LMULWriteResMX<"WriteVLDS32", [SiFive7VL], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFive7VL], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFive7VL], mx, IsWorstCase>;
+  }
+  let Latency = 1, ResourceCycles = [Cycles] in {
+    defm "" : LMULWriteResMX<"WriteVSTS32", [SiFive7VS], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFive7VS], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFive7VS], mx, IsWorstCase>;
+  }
+}
+foreach mx = SchedMxList in {
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64>.c;
+  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = !add(3, Cycles), ResourceCycles = [Cycles] in {
+    defm "" : LMULWriteResMX<"WriteVLDS64", [SiFive7VL], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFive7VL], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFive7VL], mx, IsWorstCase>;
+  }
+  let Latency = 1, ResourceCycles = [Cycles] in {
+    defm "" : LMULWriteResMX<"WriteVSTS64", [SiFive7VS], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFive7VS], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDFF", [SiFive7VL], mx, IsWorstCase>;
   }
 }
@@ -414,18 +491,48 @@
 let Latency = 1, ResourceCycles = [16] in
   def : WriteRes<WriteVST8R, [SiFive7VS]>;
 
+// Segmented Loads and Stores
+// Unit-stride segmented loads and stores are effectively converted into strided
+// segment loads and stores. Strided segment loads and stores operate at up to
+// one segment per cycle if the segment fits within one aligned memory beat.
+// Indexed segment loads and stores operate at the same rate as strided ones,
+// but they stall the machine until all addresses have been generated.
 foreach mx = SchedMxList in {
-  foreach nf=2-8 in {
-    foreach eew = [8, 16, 32, 64] in {
-      defvar Cycles = SiFive7GetCyclesSegmented<mx, nf>.c;
+  foreach eew = [8, 16, 32, 64] in {
+    defvar Cycles = SiFive7GetCyclesSegmentedSeg2<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    // Does not chain so set latency high
+    let Latency = !add(3, Cycles), ResourceCycles = [Cycles] in {
+      defm "" : LMULWriteResMX<"WriteVLSEG2e" # eew, [SiFive7VL], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVLSEGFF2e" # eew, [SiFive7VL], mx, IsWorstCase>;
+    }
+    let Latency = 1, ResourceCycles = [Cycles] in
+      defm "" : LMULWriteResMX<"WriteVSSEG2e" # eew, [SiFive7VS], mx, IsWorstCase>;
+    foreach nf=3-8 in {
+      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-      let Latency = Cycles, ResourceCycles = [Cycles] in {
+      // Does not chain so set latency high
+      let Latency = !add(3, Cycles), ResourceCycles = [Cycles] in {
         defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew, [SiFive7VL], mx, IsWorstCase>;
         defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" # eew, [SiFive7VL], mx, IsWorstCase>;
+      }
+      let Latency = 1, ResourceCycles = [Cycles] in
+        defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" # eew, [SiFive7VS], mx, IsWorstCase>;
+    }
+  }
+}
+foreach mx = SchedMxList in {
+  foreach nf=2-8 in {
+    foreach eew = [8, 16, 32, 64] in {
+      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+      // Does not chain so set latency high
+      let Latency = !add(3, Cycles), ResourceCycles = [Cycles] in {
         defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" # eew, [SiFive7VL], mx, IsWorstCase>;
         defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" # eew, [SiFive7VL], mx, IsWorstCase>;
         defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" # eew, [SiFive7VL], mx, IsWorstCase>;
-        defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" # eew, [SiFive7VS], mx, IsWorstCase>;
+      }
+      let Latency = 1, ResourceCycles = [Cycles] in {
         defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" # eew, [SiFive7VS], mx, IsWorstCase>;
         defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" # eew, [SiFive7VS], mx, IsWorstCase>;
         defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" # eew, [SiFive7VS], mx, IsWorstCase>;