diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -557,6 +557,14 @@
                                MachineFunction &MF) const;
 
   bool useRVVForFixedLengthVectorVT(MVT VT) const;
+
+  /// RVV code generation for fixed length vectors does not lower all
+  /// BUILD_VECTORs. This makes BUILD_VECTOR legalisation a source of stores to
+  /// merge. However, merging them creates a BUILD_VECTOR that is just as
+  /// illegal as the original, thus leading to an infinite legalisation loop.
+  /// NOTE: Once BUILD_VECTOR can be custom lowered for all legal vector types,
+  /// this override can be removed.
+  bool mergeStoresAfterLegalization(EVT VT) const override;
 };
 
 namespace RISCV {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1144,6 +1144,13 @@
   return {SubRegIdx, InsertExtractIdx};
 }
 
+// Permit combining of mask vectors as BUILD_VECTOR never expands to scalar
+// stores for those types.
+bool RISCVTargetLowering::mergeStoresAfterLegalization(EVT VT) const {
+  return !Subtarget.useRVVForFixedLengthVectors() ||
+         (VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1);
+}
+
 static bool useRVVForFixedLengthVectorVT(MVT VT,
                                          const RISCVSubtarget &Subtarget) {
   assert(VT.isFixedLengthVector() && "Expected a fixed length vector type!");
@@ -1372,8 +1379,7 @@
     if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
       // If we have to use more than one INSERT_VECTOR_ELT then this
       // optimization is likely to increase code size; avoid performing it in
-      // such a case. We can go through the stack as long as we're at least
-      // byte-sized.
+      // such a case. We can use a load from a constant pool instead.
       if (DAG.shouldOptForSize() && NumElts > NumViaIntegerBits)
         return SDValue();
       // Now we can create our integer vector type. Note that it may be larger
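For reference (not part of this patch): the hook overridden above is declared
in llvm/include/llvm/CodeGen/TargetLowering.h with a default of true, and the
DAGCombiner consults it in visitSTORE before re-attempting store merging once
types are legal. A paraphrased sketch follows; exact wording and placement may
differ between LLVM revisions:

  // TargetLowering.h: targets allow post-legalization store merging by
  // default.
  virtual bool mergeStoresAfterLegalization(EVT MemVT) const { return true; }

  // DAGCombiner::visitSTORE (paraphrased): always merge before type
  // legalization; afterwards, only if the target opts in for this memory VT.
  if (!LegalTypes || TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))
    if (mergeConsecutiveStores(ST))
      return SDValue(N, 0);

Returning false for non-mask fixed-length RVV types is therefore enough to
break the expand/merge cycle described in the header comment, while i1 mask
vectors stay mergeable because their BUILD_VECTORs never expand to scalar
stores.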
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
 
 ; Tests that a floating-point build_vector doesn't try and generate a VID
 ; instruction
 
@@ -20,12 +20,81 @@
   ret void
 }
 
+; Not all BUILD_VECTORs are successfully lowered by the backend: some are
+; expanded into scalarized stack stores. However, this may result in an
+; infinite loop in the DAGCombiner which tries to recombine those stores into a
+; BUILD_VECTOR followed by a vector store. The BUILD_VECTOR is then expanded
+; and the loop begins.
+; Until all BUILD_VECTORs are lowered, we disable store-combining after
+; legalization for fixed-length vectors.
+; This test uses a trick with a shufflevector which can't be lowered to a
+; VECTOR_SHUFFLE node; the mask is shorter than the source vectors and the
+; shuffle indices aren't located within the same 4-element subvector, so it is
+; expanded to 4 EXTRACT_VECTOR_ELTs and a BUILD_VECTOR. This then triggers the
+; loop when expanded.
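(Editorial illustration, not autogenerated output: the shuffle in the test
below decomposes roughly into the following DAG, with element indices taken
from its mask. Node names are placeholders.

  t1 = extract_vector_elt %x, 0
  t2 = extract_vector_elt %x, 7
  t3 = extract_vector_elt %y, 0
  t4 = extract_vector_elt %y, 7
  t5 = BUILD_VECTOR t1, t2, t3, t4   ; v4f32; not always lowerable, so it is
                                     ; expanded to four scalar stack stores,
                                     ; which the combiner would previously
                                     ; merge back into a vector store of a
                                     ; new, equally illegal BUILD_VECTOR.
)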
+define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, <8 x float> %y) optsize {
+; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    addi sp, sp, -16
+; LMULMAX2-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vfmv.f.s ft0, v10
+; LMULMAX2-NEXT:    fsw ft0, 8(sp)
+; LMULMAX2-NEXT:    vfmv.f.s ft0, v8
+; LMULMAX2-NEXT:    fsw ft0, 0(sp)
+; LMULMAX2-NEXT:    vsetivli a0, 1, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vslidedown.vi v26, v10, 7
+; LMULMAX2-NEXT:    vfmv.f.s ft0, v26
+; LMULMAX2-NEXT:    fsw ft0, 12(sp)
+; LMULMAX2-NEXT:    vslidedown.vi v26, v8, 7
+; LMULMAX2-NEXT:    vfmv.f.s ft0, v26
+; LMULMAX2-NEXT:    fsw ft0, 4(sp)
+; LMULMAX2-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vle32.v v8, (sp)
+; LMULMAX2-NEXT:    addi sp, sp, 16
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: hang_when_merging_stores_after_legalization:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vmv.v.i v25, 0
+; LMULMAX1-NEXT:    vrgather.vv v26, v8, v25
+; LMULMAX1-NEXT:    addi a0, zero, 2
+; LMULMAX1-NEXT:    vsetivli a1, 1, e8,mf8,ta,mu
+; LMULMAX1-NEXT:    vmv.s.x v0, a0
+; LMULMAX1-NEXT:    lui a0, %hi(.LCPI1_0)
+; LMULMAX1-NEXT:    addi a0, a0, %lo(.LCPI1_0)
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v27, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e32,m1,tu,mu
+; LMULMAX1-NEXT:    vrgather.vv v26, v9, v27, v0.t
+; LMULMAX1-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vrgather.vv v27, v10, v25
+; LMULMAX1-NEXT:    addi a0, zero, 8
+; LMULMAX1-NEXT:    vsetivli a1, 1, e8,mf8,ta,mu
+; LMULMAX1-NEXT:    vmv.s.x v0, a0
+; LMULMAX1-NEXT:    lui a0, %hi(.LCPI1_1)
+; LMULMAX1-NEXT:    addi a0, a0, %lo(.LCPI1_1)
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v25, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e32,m1,tu,mu
+; LMULMAX1-NEXT:    vrgather.vv v27, v11, v25, v0.t
+; LMULMAX1-NEXT:    addi a0, zero, 3
+; LMULMAX1-NEXT:    vsetivli a1, 1, e8,mf8,ta,mu
+; LMULMAX1-NEXT:    vmv.s.x v0, a0
+; LMULMAX1-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vmerge.vvm v8, v27, v26, v0
+; LMULMAX1-NEXT:    ret
+  %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>
+  ret <4 x float> %z
+}
+
 define void @buildvec_dominant0_v4f32(<4 x float>* %x) {
 ; CHECK-LABEL: buildvec_dominant0_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; CHECK-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI1_0)
+; CHECK-NEXT:    lui a1, %hi(.LCPI2_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI2_0)
 ; CHECK-NEXT:    vlse32.v v25, (a1), zero
 ; CHECK-NEXT:    fmv.w.x ft0, zero
 ; CHECK-NEXT:    vfmv.s.f v26, ft0
@@ -61,8 +130,8 @@
 define void @buildvec_dominant2_v4f32(<4 x float>* %x, float %f) {
 ; CHECK-LABEL: buildvec_dominant2_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI3_0)
-; CHECK-NEXT:    flw ft0, %lo(.LCPI3_0)(a1)
+; CHECK-NEXT:    lui a1, %hi(.LCPI4_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI4_0)(a1)
 ; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
 ; CHECK-NEXT:    vfmv.s.f v25, ft0
 ; CHECK-NEXT:    vfmv.v.f v26, fa0
@@ -84,8 +153,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a1, zero, 6
 ; CHECK-NEXT:    vsetivli a2, 1, e8,mf8,ta,mu
-; CHECK-NEXT:    lui a2, %hi(.LCPI4_0)
-; CHECK-NEXT:    flw ft0, %lo(.LCPI4_0)(a2)
+; CHECK-NEXT:    lui a2, %hi(.LCPI5_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI5_0)(a2)
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
 ; CHECK-NEXT:    vfmv.v.f v25, fa0
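The .LCPI renumbering in the trailing hunks is mechanical: constant-pool
labels embed the index of their enclosing function within the file, and
inserting @hang_when_merging_stores_after_legalization shifts every later
function by one. Since the assertions are autogenerated, a typical way to
refresh them after such a change (the llc path here is an assumption about
your build tree) is:

  llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
      llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll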