diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5018,13 +5018,47 @@
   if (ST->isTruncatingStore())
     return TLI.scalarizeVectorStore(ST, DAG);
 
+  // Generate a vector-predicated store if it is custom/legal on the target.
+  // To avoid possible recursion, only do this if the widened mask type is
+  // legal.
+  // FIXME: Not all targets may support EVL in VP_STORE. These will have been
+  // removed from the IR by the ExpandVectorPredication pass but we're
+  // reintroducing them here.
+  // FIXME: We currently only perform this on scalable vector types; because
+  // VP_STORE isn't currently as well-optimized as STORE, it is known to
+  // generate worse code on some fixed-length vector tests.
+  // FIXME: This should really go after GenWidenVectorStores, but that method
+  // may crash the compiler given certain scalable-vector types. We should
+  // rewrite it to first create a plan of action before committing to the code
+  // generation strategy.
+  SDValue StVal = ST->getValue();
+  EVT StVT = StVal.getValueType();
+  EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT);
+  EVT WideMaskVT = WideVT.changeVectorElementType(MVT::i1);
+  if (WideVT.isScalableVector() &&
+      TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) &&
+      TLI.isTypeLegal(WideMaskVT)) {
+    // Widen the value.
+    SDLoc DL(N);
+    StVal = GetWidenedVector(StVal);
+    SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
+    MVT EVLVT = TLI.getVPExplicitVectorLengthTy();
+    unsigned NumVTElts = StVT.getVectorMinNumElements();
+    SDValue EVL =
+        DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts));
+    const auto *MMO = ST->getMemOperand();
+    return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(), Mask,
+                          EVL, MMO->getPointerInfo(), MMO->getAlign(),
+                          MMO->getFlags(), MMO->getAAInfo());
+  }
+
   SmallVector<SDValue, 16> StChain;
   GenWidenVectorStores(StChain, ST);
 
   if (StChain.size() == 1)
     return StChain[0];
-  else
-    return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
+
+  return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
 }
 
 SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+; Check that we are able to legalize fixed-length loads & stores that require widening.
+
+; FIXME: Legalizing to VP_STORE is potentially better here.
+
+define void @store_v3i8(<3 x i8> %val, <3 x i8>* %ptr) {
+; CHECK-LABEL: store_v3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, mu
+; CHECK-NEXT:    vslidedown.vi v25, v8, 2
+; CHECK-NEXT:    addi a1, a0, 2
+; CHECK-NEXT:    vse8.v v25, (a1)
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+  store <3 x i8> %val, <3 x i8>* %ptr
+  ret void
+}
+
+; FIXME: Legalizing to VP_STORE is likely better here.
+
+define void @store_v3f16(<3 x half> %val, <3 x half>* %ptr) {
+; CHECK-LABEL: store_v3f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, mu
+; CHECK-NEXT:    vslidedown.vi v25, v8, 2
+; CHECK-NEXT:    vfmv.f.s ft0, v25
+; CHECK-NEXT:    fsh ft0, 4(a0)
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+  store <3 x half> %val, <3 x half>* %ptr
+  ret void
+}
+
+; FIXME: One vector load is likely better here: see load_v5f32.
+
+define <3 x i8> @load_v3i8(<3 x i8>* %ptr) {
+; CHECK-LABEL: load_v3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    lw a0, 0(a0)
+; CHECK-NEXT:    sw a0, 12(sp)
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
+; CHECK-NEXT:    addi a0, sp, 12
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = load <3 x i8>, <3 x i8>* %ptr
+  ret <3 x i8> %v
+}
+
+; FIXME: Would loading 5 elements be any better than 8?
+
+define <5 x float> @load_v5f32(<5 x float>* %ptr) {
+; CHECK-LABEL: load_v5f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %v = load <5 x float>, <5 x float>* %ptr
+  ret <5 x float> %v
+}
+
+; FIXME: We could legalize to VP_LOAD instead of scalarizing.
+
+define <5 x float> @load_volatile_v5f32(<5 x float>* %ptr) {
+; RV32-LABEL: load_volatile_v5f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -96
+; RV32-NEXT:    .cfi_def_cfa_offset 96
+; RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    addi s0, sp, 96
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    andi sp, sp, -32
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, mu
+; RV32-NEXT:    vslidedown.vi v26, v25, 1
+; RV32-NEXT:    vfmv.f.s ft0, v26
+; RV32-NEXT:    vslidedown.vi v26, v25, 2
+; RV32-NEXT:    vfmv.f.s ft1, v26
+; RV32-NEXT:    flw ft2, 16(a0)
+; RV32-NEXT:    vslidedown.vi v26, v25, 3
+; RV32-NEXT:    vfmv.f.s ft3, v26
+; RV32-NEXT:    vfmv.f.s ft4, v25
+; RV32-NEXT:    fsw ft2, 48(sp)
+; RV32-NEXT:    fsw ft4, 32(sp)
+; RV32-NEXT:    fsw ft3, 44(sp)
+; RV32-NEXT:    fsw ft1, 40(sp)
+; RV32-NEXT:    fsw ft0, 36(sp)
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT:    addi a0, sp, 32
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -96
+; RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 96
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: load_volatile_v5f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    .cfi_def_cfa_offset 96
+; RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    addi s0, sp, 96
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    andi sp, sp, -32
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, mu
+; RV64-NEXT:    vslidedown.vi v26, v25, 1
+; RV64-NEXT:    vfmv.f.s ft0, v26
+; RV64-NEXT:    vslidedown.vi v26, v25, 2
+; RV64-NEXT:    vfmv.f.s ft1, v26
+; RV64-NEXT:    flw ft2, 16(a0)
+; RV64-NEXT:    vslidedown.vi v26, v25, 3
+; RV64-NEXT:    vfmv.f.s ft3, v26
+; RV64-NEXT:    vfmv.f.s ft4, v25
+; RV64-NEXT:    fsw ft2, 48(sp)
+; RV64-NEXT:    fsw ft4, 32(sp)
+; RV64-NEXT:    fsw ft3, 44(sp)
+; RV64-NEXT:    fsw ft1, 40(sp)
+; RV64-NEXT:    fsw ft0, 36(sp)
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT:    addi a0, sp, 32
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -96
+; RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 96
+; RV64-NEXT:    ret
+  %v = load volatile <5 x float>, <5 x float>* %ptr
+  ret <5 x float> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/load-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/load-store-sdnode.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/load-store-sdnode.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
+
+; Check that we are able to legalize loads & stores that require widening.
+
+define void @store_nxv3i8(<vscale x 3 x i8> %val, <vscale x 3 x i8>* %ptr) {
+; CHECK-LABEL: store_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    slli a2, a1, 1
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+  store <vscale x 3 x i8> %val, <vscale x 3 x i8>* %ptr
+  ret void
+}
+
+define void @store_nxv7f64(<vscale x 7 x double> %val, <vscale x 7 x double>* %ptr) {
+; CHECK-LABEL: store_nxv7f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    slli a2, a1, 3
+; CHECK-NEXT:    sub a1, a2, a1
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+  store <vscale x 7 x double> %val, <vscale x 7 x double>* %ptr
+  ret void
+}
+
+; FIXME: The following tests crash when using 'volatile' loads
+
+define <vscale x 3 x i8> @load_nxv3i8(<vscale x 3 x i8>* %ptr) {
+; CHECK-LABEL: load_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    ret
+  %v = load <vscale x 3 x i8>, <vscale x 3 x i8>* %ptr
+  ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 5 x half> @load_nxv5f16(<vscale x 5 x half>* %ptr) {
+; CHECK-LABEL: load_nxv5f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl2re16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %v = load <vscale x 5 x half>, <vscale x 5 x half>* %ptr
+  ret <vscale x 5 x half> %v
+}
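For reference, the IR-level effect of the new widening path can be sketched as follows. This is an illustration only and not part of the patch: the change operates on SelectionDAG VP_STORE nodes rather than on IR, the sketch function name is hypothetical, and the exact vp.store intrinsic mangling shown is an assumption for typed-pointer IR. For store_nxv3i8 above, the <vscale x 3 x i8> value is first widened to the legal <vscale x 4 x i8> type and then stored with an all-ones mask and an explicit vector length of 3 * vscale, so the extra widened lanes are never written to memory.

; Illustration only: rough IR analogue of the widened VP_STORE emitted for
; store_nxv3i8. The caller is assumed to pass the already-widened value.
declare void @llvm.vp.store.nxv4i8.p0nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>*, <vscale x 4 x i1>, i32)
declare i32 @llvm.vscale.i32()

define void @store_nxv3i8_vp_sketch(<vscale x 4 x i8> %widened_val, <vscale x 4 x i8>* %ptr) {
  ; Build an all-ones mask with the usual scalable splat idiom.
  %head = insertelement <vscale x 4 x i1> undef, i1 true, i32 0
  %mask = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
  ; EVL = 3 * vscale, matching DAG.getVScale(..., NumVTElts) in the patch:
  ; only the lanes of the original <vscale x 3 x i8> value are stored.
  %vscale = call i32 @llvm.vscale.i32()
  %evl = mul i32 %vscale, 3
  call void @llvm.vp.store.nxv4i8.p0nxv4i8(<vscale x 4 x i8> %widened_val, <vscale x 4 x i8>* %ptr, <vscale x 4 x i1> %mask, i32 %evl)
  ret void
}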