diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -376,6 +376,7 @@
                           SelectionDAG &DAG) const override;
   bool shouldSinkOperands(Instruction *I,
                           SmallVectorImpl<Use *> &Ops) const override;
+  bool shouldScalarizeBinop(SDValue VecOp) const override;
   bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1313,6 +1313,25 @@
   return true;
 }
 
+bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
+  unsigned Opc = VecOp.getOpcode();
+
+  // Assume target opcodes can't be scalarized.
+  // TODO - do we have any exceptions?
+  if (Opc >= ISD::BUILTIN_OP_END)
+    return false;
+
+  // If the vector op is not supported, try to convert to scalar.
+  EVT VecVT = VecOp.getValueType();
+  if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
+    return true;
+
+  // If the vector op is supported, but the scalar op is not, the transform may
+  // not be worthwhile.
+  EVT ScalarVT = VecVT.getScalarType();
+  return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
+}
+
 bool RISCVTargetLowering::isOffsetFoldingLegal(
     const GlobalAddressSDNode *GA) const {
   // In order to maximise the opportunity for common subexpression elimination,
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+experimental-zvfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+experimental-zvfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32NOM
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+experimental-zvfh,+f,+d,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32M
 ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+experimental-zvfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+experimental-zvfh,+f,+d,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

 define i8 @extractelt_v16i8(<16 x i8>* %x) nounwind {
 ; CHECK-LABEL: extractelt_v16i8:
@@ -613,74 +615,123 @@
 }

 define i32 @extractelt_add_v4i32(<4 x i32> %x) {
-; CHECK-LABEL: extractelt_add_v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vadd.vi v8, v8, 13
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: extractelt_add_v4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32-NEXT: vslidedown.vi v8, v8, 2
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: addi a0, a0, 13
+; RV32-NEXT: ret
+;
+; RV64-LABEL: extractelt_add_v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV64-NEXT: vadd.vi v8, v8, 13
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
   %bo = add <4 x i32> %x,
   %ext = extractelement <4 x i32> %bo, i32 2
   ret i32 %ext
 }

 define i32 @extractelt_sub_v4i32(<4 x i32> %x) {
-; CHECK-LABEL: extractelt_sub_v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vrsub.vi v8, v8, 13
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: extractelt_sub_v4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32-NEXT: vslidedown.vi v8, v8, 2
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: li a1, 13
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: extractelt_sub_v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV64-NEXT: vrsub.vi v8, v8, 13
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
   %bo = sub <4 x i32> , %x
   %ext = extractelement <4 x i32> %bo, i32 2
   ret i32 %ext
 }

 define i32 @extractelt_mul_v4i32(<4 x i32> %x) {
-; CHECK-LABEL: extractelt_mul_v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 13
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vmul.vx v8, v8, a0
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32NOM-LABEL: extractelt_mul_v4i32:
+; RV32NOM: # %bb.0:
+; RV32NOM-NEXT: li a0, 13
+; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV32NOM-NEXT: vmul.vx v8, v8, a0
+; RV32NOM-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32NOM-NEXT: vslidedown.vi v8, v8, 2
+; RV32NOM-NEXT: vmv.x.s a0, v8
+; RV32NOM-NEXT: ret
+;
+; RV32M-LABEL: extractelt_mul_v4i32:
+; RV32M: # %bb.0:
+; RV32M-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32M-NEXT: vslidedown.vi v8, v8, 2
+; RV32M-NEXT: vmv.x.s a0, v8
+; RV32M-NEXT: li a1, 13
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: ret
+;
+; RV64-LABEL: extractelt_mul_v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 13
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV64-NEXT: vmul.vx v8, v8, a0
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
   %bo = mul <4 x i32> %x,
   %ext = extractelement <4 x i32> %bo, i32 2
   ret i32 %ext
 }

 define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
-; RV32-LABEL: extractelt_sdiv_v4i32:
-; RV32: # %bb.0:
-; RV32-NEXT: li a0, -1
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; RV32-NEXT: vmv.s.x v9, a0
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu
-; RV32-NEXT: vslideup.vi v10, v9, 3
-; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; RV32-NEXT: lui a0, %hi(.LCPI38_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI38_0)
-; RV32-NEXT: vle32.v v9, (a0)
-; RV32-NEXT: lui a0, %hi(.LCPI38_1)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI38_1)
-; RV32-NEXT: vle32.v v11, (a0)
-; RV32-NEXT: vand.vv v10, v8, v10
-; RV32-NEXT: vmulh.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: vsra.vv v9, v8, v11
-; RV32-NEXT: vsrl.vi v8, v8, 31
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; RV32-NEXT: vslidedown.vi v8, v8, 2
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: ret
+; RV32NOM-LABEL: extractelt_sdiv_v4i32:
+; RV32NOM: # %bb.0:
+; RV32NOM-NEXT: li a0, -1
+; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV32NOM-NEXT: vmv.s.x v9, a0
+; RV32NOM-NEXT: vmv.v.i v10, 0
+; RV32NOM-NEXT: vsetvli zero, zero, e32, m1, tu, mu
+; RV32NOM-NEXT: vslideup.vi v10, v9, 3
+; RV32NOM-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32NOM-NEXT: lui a0, %hi(.LCPI38_0)
+; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI38_0)
+; RV32NOM-NEXT: vle32.v v9, (a0)
+; RV32NOM-NEXT: lui a0, %hi(.LCPI38_1)
+; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI38_1)
+; RV32NOM-NEXT: vle32.v v11, (a0)
+; RV32NOM-NEXT: vand.vv v10, v8, v10
+; RV32NOM-NEXT: vmulh.vv v8, v8, v9
+; RV32NOM-NEXT: vadd.vv v8, v8, v10
+; RV32NOM-NEXT: vsra.vv v9, v8, v11
+; RV32NOM-NEXT: vsrl.vi v8, v8, 31
+; RV32NOM-NEXT: vadd.vv v8, v9, v8
+; RV32NOM-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32NOM-NEXT: vslidedown.vi v8, v8, 2
+; RV32NOM-NEXT: vmv.x.s a0, v8
+; RV32NOM-NEXT: ret
+;
+; RV32M-LABEL: extractelt_sdiv_v4i32:
+; RV32M: # %bb.0:
+; RV32M-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32M-NEXT: vslidedown.vi v8, v8, 2
+; RV32M-NEXT: vmv.x.s a0, v8
+; RV32M-NEXT: lui a1, 322639
+; RV32M-NEXT: addi a1, a1, -945
+; RV32M-NEXT: mulh a0, a0, a1
+; RV32M-NEXT: srli a1, a0, 31
+; RV32M-NEXT: srai a0, a0, 2
+; RV32M-NEXT: add a0, a0, a1
+; RV32M-NEXT: ret
 ;
 ; RV64-LABEL: extractelt_sdiv_v4i32:
 ; RV64: # %bb.0:
@@ -713,25 +764,56 @@
 }

 define i32 @extractelt_udiv_v4i32(<4 x i32> %x) {
-; CHECK-LABEL: extractelt_udiv_v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 1
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
-; CHECK-NEXT: vslideup.vi v10, v9, 3
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: lui a0, %hi(.LCPI39_0)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI39_0)
-; CHECK-NEXT: vle32.v v9, (a0)
-; CHECK-NEXT: vsrl.vv v8, v8, v10
-; CHECK-NEXT: vmulhu.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v8, v8, 2
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32NOM-LABEL: extractelt_udiv_v4i32:
+; RV32NOM: # %bb.0:
+; RV32NOM-NEXT: li a0, 1
+; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV32NOM-NEXT: vmv.s.x v9, a0
+; RV32NOM-NEXT: vmv.v.i v10, 0
+; RV32NOM-NEXT: vsetvli zero, zero, e32, m1, tu, mu
+; RV32NOM-NEXT: vslideup.vi v10, v9, 3
+; RV32NOM-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV32NOM-NEXT: lui a0, %hi(.LCPI39_0)
+; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI39_0)
+; RV32NOM-NEXT: vle32.v v9, (a0)
+; RV32NOM-NEXT: vsrl.vv v8, v8, v10
+; RV32NOM-NEXT: vmulhu.vv v8, v8, v9
+; RV32NOM-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32NOM-NEXT: vslidedown.vi v8, v8, 2
+; RV32NOM-NEXT: vmv.x.s a0, v8
+; RV32NOM-NEXT: srli a0, a0, 2
+; RV32NOM-NEXT: ret
+;
+; RV32M-LABEL: extractelt_udiv_v4i32:
+; RV32M: # %bb.0:
+; RV32M-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV32M-NEXT: vslidedown.vi v8, v8, 2
+; RV32M-NEXT: vmv.x.s a0, v8
+; RV32M-NEXT: lui a1, 322639
+; RV32M-NEXT: addi a1, a1, -945
+; RV32M-NEXT: mulhu a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 2
+; RV32M-NEXT: ret
+;
+; RV64-LABEL: extractelt_udiv_v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: vsetvli zero, zero, e32, m1, tu, mu
+; RV64-NEXT: vslideup.vi v10, v9, 3
+; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; RV64-NEXT: lui a0, %hi(.LCPI39_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI39_0)
+; RV64-NEXT: vle32.v v9, (a0)
+; RV64-NEXT: vsrl.vv v8, v8, v10
+; RV64-NEXT: vmulhu.vv v8, v8, v9
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
   %bo = udiv <4 x i32> %x,
   %ext = extractelement <4 x i32> %bo, i32 2
   ret i32 %ext
@@ -742,11 +824,10 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: lui a0, %hi(.LCPI40_0)
 ; CHECK-NEXT: flw ft0, %lo(.LCPI40_0)(a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vfadd.vf v8, v8, ft0
 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu
 ; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vfmv.f.s ft1, v8
+; CHECK-NEXT: fadd.s fa0, ft1, ft0
 ; CHECK-NEXT: ret
   %bo = fadd <4 x float> %x,
   %ext = extractelement <4 x float> %bo, i32 2
@@ -758,11 +839,10 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: lui a0, %hi(.LCPI41_0)
 ; CHECK-NEXT: flw ft0, %lo(.LCPI41_0)(a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vfrsub.vf v8, v8, ft0
 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu
 ; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vfmv.f.s ft1, v8
+; CHECK-NEXT: fsub.s fa0, ft0, ft1
 ; CHECK-NEXT: ret
   %bo = fsub <4 x float> , %x
   %ext = extractelement <4 x float> %bo, i32 2
@@ -774,11 +854,10 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: lui a0, %hi(.LCPI42_0)
 ; CHECK-NEXT: flw ft0, %lo(.LCPI42_0)(a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vfmul.vf v8, v8, ft0
 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu
 ; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vfmv.f.s ft1, v8
+; CHECK-NEXT: fmul.s fa0, ft1, ft0
 ; CHECK-NEXT: ret
   %bo = fmul <4 x float> %x,
   %ext = extractelement <4 x float> %bo, i32 2
@@ -790,11 +869,10 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: lui a0, %hi(.LCPI43_0)
 ; CHECK-NEXT: flw ft0, %lo(.LCPI43_0)(a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vfdiv.vf v8, v8, ft0
 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu
 ; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vfmv.f.s ft1, v8
+; CHECK-NEXT: fdiv.s fa0, ft1, ft0
 ; CHECK-NEXT: ret
   %bo = fdiv <4 x float> %x,
   %ext = extractelement <4 x float> %bo, i32 2