diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -717,6 +717,11 @@
                       "true", "Has reasonably performant unaligned scalar "
                       "loads and stores">;
 
+def FeatureUnalignedVectorMem
+   : SubtargetFeature<"unaligned-vector-mem", "EnableUnalignedVectorMem",
+                      "true", "Has reasonably performant unaligned vector "
+                      "loads and stores">;
+
 def TuneNoOptimizedZeroStrideLoad
    : SubtargetFeature<"no-optimized-zero-stride-load", "HasOptimizedZeroStrideLoad",
                       "false", "Hasn't optimized (perform fewer memory operations)"
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -15636,7 +15636,13 @@
     return true;
   }
 
-  return false;
+  // Note: We lower an unmasked unaligned vector access to an equally sized
+  // e8 element type access. Given this, we effectively support all unmasked
+  // misaligned accesses. TODO: Work through the codegen implications of
+  // allowing such accesses to be formed, and considered fast.
+  if (Fast)
+    *Fast = 0;
+  return Subtarget.enableUnalignedVectorMem();
 }
 
 bool RISCVTargetLowering::splitValueIntoRegisterParts(
@@ -15811,7 +15817,8 @@
   if (!isLegalElementTypeForRVV(ScalarType))
     return false;
 
-  if (Alignment < DL.getTypeStoreSize(ScalarType).getFixedValue())
+  if (!Subtarget.enableUnalignedVectorMem() &&
+      Alignment < DL.getTypeStoreSize(ScalarType).getFixedValue())
     return false;
 
   return true;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -190,11 +190,12 @@
     if (isa<FixedVectorType>(DataType) && !ST->useRVVForFixedLengthVectors())
       return false;
 
-    if (Alignment <
-        DL.getTypeStoreSize(DataType->getScalarType()).getFixedValue())
+    auto *ElemType = DataType->getScalarType();
+    if (!ST->enableUnalignedVectorMem() &&
+        Alignment < DL.getTypeStoreSize(ElemType).getFixedValue())
       return false;
 
-    return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
+    return TLI->isLegalElementTypeForRVV(ElemType);
   }
 
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
@@ -212,11 +213,12 @@
     if (isa<FixedVectorType>(DataType) && !ST->useRVVForFixedLengthVectors())
       return false;
 
-    if (Alignment <
-        DL.getTypeStoreSize(DataType->getScalarType()).getFixedValue())
+    auto *ElemType = DataType->getScalarType();
+    if (!ST->enableUnalignedVectorMem() &&
+        Alignment < DL.getTypeStoreSize(ElemType).getFixedValue())
       return false;
 
-    return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
+    return TLI->isLegalElementTypeForRVV(ElemType);
   }
 
   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64
-; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,ZVE64F
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN
+
+; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F
 
 ; The two loads are contigous and should be folded into one
 define void @widen_2xv4i16(ptr %x, ptr %z) {
@@ -109,6 +111,46 @@
   ret void
 }
 
+define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
+; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
+; CHECK-NO-MISALIGN:       # %bb.0:
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
+; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a2)
+; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v14, (a0)
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
+; CHECK-NO-MISALIGN-NEXT:    ret
+;
+; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
+; RV64-MISALIGN:       # %bb.0:
+; RV64-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV64-MISALIGN-NEXT:    vle16.v v8, (a0)
+; RV64-MISALIGN-NEXT:    vse16.v v8, (a1)
+; RV64-MISALIGN-NEXT:    ret
+  %a = load <4 x i16>, ptr %x, align 1
+  %b.gep = getelementptr i8, ptr %x, i64 8
+  %b = load <4 x i16>, ptr %b.gep, align 1
+  %c.gep = getelementptr i8, ptr %b.gep, i64 8
+  %c = load <4 x i16>, ptr %c.gep, align 1
+  %d.gep = getelementptr i8, ptr %c.gep, i64 8
+  %d = load <4 x i16>, ptr %d.gep, align 1
+  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i16> %e.2, ptr %z
+  ret void
+}
+
 ; Should be a strided load - with type coercion to i64
 define void @strided_constant(ptr %x, ptr %z) {
 ; CHECK-LABEL: strided_constant:
@@ -365,17 +407,23 @@
   ret void
 }
 
-; Shouldn't be combined because the resulting load would not be aligned
 define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
-; CHECK-LABEL: strided_unaligned:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    vse16.v v8, (a1)
-; CHECK-NEXT:    ret
+; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
+; CHECK-NO-MISALIGN:       # %bb.0:
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
+; CHECK-NO-MISALIGN-NEXT:    add a0, a0, a2
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
+; CHECK-NO-MISALIGN-NEXT:    ret
+;
+; RV64-MISALIGN-LABEL: strided_unaligned:
+; RV64-MISALIGN:       # %bb.0:
+; RV64-MISALIGN-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-MISALIGN-NEXT:    vlse64.v v8, (a0), a2
+; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
+; RV64-MISALIGN-NEXT:    ret
   %a = load <4 x i16>, ptr %x, align 1
   %b.gep = getelementptr i8, ptr %x, i64 %s
   %b = load <4 x i16>, ptr %b.gep, align 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll b/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll
--- a/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll
@@ -3,6 +3,11 @@
 ; RUN:   -verify-machineinstrs | FileCheck %s
 ; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v < %s \
 ; RUN:   -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v,+unaligned-vector-mem < %s \
+; RUN:   -verify-machineinstrs | FileCheck --check-prefix=UNALIGNED %s
+; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v,+unaligned-vector-mem < %s \
+; RUN:   -verify-machineinstrs | FileCheck --check-prefix=UNALIGNED %s
+
 
 define <vscale x 1 x i32> @unaligned_load_nxv1i32_a1(<vscale x 1 x i32>* %ptr) {
 ; CHECK-LABEL: unaligned_load_nxv1i32_a1:
@@ -10,6 +15,12 @@
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i32_a1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; UNALIGNED-NEXT:    vle32.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 1
   ret <vscale x 1 x i32> %v
 }
@@ -20,6 +31,12 @@
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i32_a2:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; UNALIGNED-NEXT:    vle32.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 2
   ret <vscale x 1 x i32> %v
 }
@@ -30,6 +47,12 @@
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv1i32_a4:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; UNALIGNED-NEXT:    vle32.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 4
   ret <vscale x 1 x i32> %v
 }
@@ -39,6 +62,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl1r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i64_a1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl1re64.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 1 x i64>, <vscale x 1 x i64>* %ptr, align 1
   ret <vscale x 1 x i64> %v
 }
@@ -48,6 +76,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl1r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i64_a4:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl1re64.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 1 x i64>, <vscale x 1 x i64>* %ptr, align 4
   ret <vscale x 1 x i64> %v
 }
@@ -57,6 +90,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl1re64.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv1i64_a8:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl1re64.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 1 x i64>, <vscale x 1 x i64>* %ptr, align 8
   ret <vscale x 1 x i64> %v
 }
@@ -66,6 +104,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv2i64_a1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl2re64.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 2 x i64>, <vscale x 2 x i64>* %ptr, align 1
   ret <vscale x 2 x i64> %v
 }
@@ -75,6 +118,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv2i64_a4:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl2re64.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 2 x i64>, <vscale x 2 x i64>* %ptr, align 4
   ret <vscale x 2 x i64> %v
 }
@@ -84,6 +132,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2re64.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv2i64_a8:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl2re64.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 2 x i64>, <vscale x 2 x i64>* %ptr, align 8
   ret <vscale x 2 x i64> %v
 }
@@ -95,6 +148,12 @@
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i1_a1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; UNALIGNED-NEXT:    vlm.v v0, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 1 x i1>, <vscale x 1 x i1>* %ptr, align 1
   ret <vscale x 1 x i1> %v
 }
@@ -104,6 +163,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv4f32_a1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl2re32.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 1
   ret <vscale x 4 x float> %v
 }
@@ -113,6 +177,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv4f32_a2:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl2re32.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 2
   ret <vscale x 4 x float> %v
 }
@@ -122,6 +191,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2re32.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv4f32_a4:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl2re32.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 4
   ret <vscale x 4 x float> %v
 }
@@ -131,6 +205,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv8f16_a1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl2re16.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 8 x half>, <vscale x 8 x half>* %ptr, align 1
   ret <vscale x 8 x half> %v
 }
@@ -140,6 +219,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2re16.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv8f16_a2:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vl2re16.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   %v = load <vscale x 8 x half>, <vscale x 8 x half>* %ptr, align 2
   ret <vscale x 8 x half> %v
 }
@@ -149,6 +233,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_store_nxv4i32_a1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vs2r.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   store <vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr, align 1
   ret void
 }
@@ -158,6 +247,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_store_nxv4i32_a2:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vs2r.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   store <vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr, align 2
   ret void
 }
@@ -167,6 +261,11 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: aligned_store_nxv4i32_a4:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vs2r.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   store <vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr, align 4
   ret void
 }
@@ -177,6 +276,12 @@
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: unaligned_store_nxv1i16_a1:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; UNALIGNED-NEXT:    vse16.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   store <vscale x 1 x i16> %x, <vscale x 1 x i16>* %ptr, align 1
   ret void
 }
@@ -187,6 +292,12 @@
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
+;
+; UNALIGNED-LABEL: aligned_store_nxv1i16_a2:
+; UNALIGNED:       # %bb.0:
+; UNALIGNED-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; UNALIGNED-NEXT:    vse16.v v8, (a0)
+; UNALIGNED-NEXT:    ret
   store <vscale x 1 x i16> %x, <vscale x 1 x i16>* %ptr, align 2
   ret void
 }
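
A minimal usage sketch of the new attribute, assuming an llc built with this patch; the file name unaligned.ll and the function @load_a1 are illustrative stand-ins mirroring the unaligned_load_nxv1i32_a1 test above. Without the feature, the byte-aligned scalable load is legalized to an e8 access; with +unaligned-vector-mem it keeps its natural element width:

  ; unaligned.ll (typed-pointer form, matching the test file above)
  define <vscale x 1 x i32> @load_a1(<vscale x 1 x i32>* %ptr) {
    %v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 1
    ret <vscale x 1 x i32> %v
  }

  $ llc -mtriple=riscv64 -mattr=+v unaligned.ll -o -
    ; expect: vsetvli a1, zero, e8, mf2, ta, ma / vle8.v v8, (a0)
  $ llc -mtriple=riscv64 -mattr=+v,+unaligned-vector-mem unaligned.ll -o -
    ; expect: vsetvli a1, zero, e32, mf2, ta, ma / vle32.v v8, (a0)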