diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -42,6 +42,11 @@ STATISTIC(NumTailCalls, "Number of tail calls"); +cl::opt<bool> UseRVVForFixedVectors( + "riscv-use-rvv-for-fixed-vectors", cl::Hidden, + cl::desc("Allow RVV instructions to be used for fixed vector types"), + cl::init(false)); + RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, const RISCVSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -140,6 +145,22 @@ if (Subtarget.hasStdExtD()) for (MVT VT : F64VecVTs) addRegClassForRVV(VT); + + if (UseRVVForFixedVectors) { + addRegisterClass(MVT::v16i8, &RISCV::VRRegClass); + addRegisterClass(MVT::v8i16, &RISCV::VRRegClass); + addRegisterClass(MVT::v4i32, &RISCV::VRRegClass); + addRegisterClass(MVT::v2i64, &RISCV::VRRegClass); + + if (Subtarget.hasStdExtZfh()) + addRegisterClass(MVT::v8f16, &RISCV::VRRegClass); + + if (Subtarget.hasStdExtF()) + addRegisterClass(MVT::v4f32, &RISCV::VRRegClass); + + if (Subtarget.hasStdExtD()) + addRegisterClass(MVT::v2f64, &RISCV::VRRegClass); + } } // Compute derived properties from the register classes. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -157,7 +157,8 @@ //===----------------------------------------------------------------------===// class VTypeInfo + ValueType Scal = XLenVT, RegisterClass ScalarReg = GPR, + int NElts = -1> { ValueType Vector = Vec; ValueType Mask = Mas; @@ -166,9 +167,11 @@ LMULInfo LMul = M; ValueType Scalar = Scal; RegisterClass ScalarRegClass = ScalarReg; + int NumElts = NElts; // The pattern fragment which produces the AVL operand, representing the // "natural" vector length for this type. For scalable vectors this is VLMax. 
- OutPatFrag AVL = VLMax; + OutPatFrag AVL = !if(!eq(NumElts, -1), + VLMax, OutPatFrag<(ops), (ADDI X0, NumElts)>); string ScalarSuffix = !cond(!eq(Scal, XLenVT) : "X", !eq(Scal, f16) : "F16", @@ -254,6 +257,22 @@ } } +// Non-scalable vector types which assume VLEN >= 128. +defset list AllFixedVectors = { + defset list FixedIntegerVectors = { + def V16I8 : VTypeInfo; + def V8I16 : VTypeInfo; + def V4I32 : VTypeInfo; + def V2I64 : VTypeInfo; + } + + defset list FixedFloatVectors = { + def V8F16 : VTypeInfo; + def V4F32 : VTypeInfo; + def V2F64 : VTypeInfo; + } +} // AllFixedVectors + // This functor is used to obtain the int vector type that has the same SEW and // multiplier as the input parameter type class GetIntVTypeInfo diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -62,7 +62,7 @@ } multiclass VPatUSLoadStoreSDNodes { - foreach vti = AllVectors in + foreach vti = !listconcat(AllVectors, AllFixedVectors) in defm "" : VPatUSLoadStoreSDNode; } @@ -109,10 +109,12 @@ multiclass VPatBinarySDNode_VV_VX { - foreach vti = AllIntegerVectors in { + foreach vti = !listconcat(AllIntegerVectors, FixedIntegerVectors) in { def : VPatBinarySDNode_VV; + } + foreach vti = AllIntegerVectors in { def : VPatBinarySDNode_XI { - foreach vti = AllIntegerVectors in { + foreach vti = !listconcat(AllIntegerVectors, FixedIntegerVectors) in { def : VPatBinarySDNode_VV; + } + foreach vti = AllIntegerVectors in { def : VPatBinarySDNode_XI; multiclass VPatBinaryFPSDNode_VV_VF { - foreach vti = AllFloatVectors in { + foreach vti = !listconcat(AllFloatVectors, FixedFloatVectors) in { def : VPatBinarySDNode_VV; + } + foreach vti = AllFloatVectors in { def : VPatBinarySDNode_VF; @@ -516,7 +517,7 @@ (add V8M8, V16M8, V24M8), 8>; defvar VMaskVTs = [vbool64_t, vbool32_t, vbool16_t, vbool8_t, - vbool4_t, vbool2_t, 
vbool1_t]; + vbool4_t, vbool2_t, vbool1_t, v1i1, v2i1, v4i1, v8i1, v16i1]; def VMV0 : RegisterClass<"RISCV", VMaskVTs, 64, (add V0)> { let Size = 64; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+experimental-Zfh,+f,+d -verify-machineinstrs -riscv-use-rvv-for-fixed-vectors < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+experimental-Zfh,+f,+d -verify-machineinstrs -riscv-use-rvv-for-fixed-vectors < %s | FileCheck %s + +define void @fadd_v8f16(<8 x half>* %x, <8 x half>* %y) { +; CHECK-LABEL: fadd_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vfadd.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %b = load <8 x half>, <8 x half>* %y + %c = fadd <8 x half> %a, %b + store <8 x half> %c, <8 x half>* %x + ret void +} + +define void @fadd_v4f32(<4 x float>* %x, <4 x float>* %y) { +; CHECK-LABEL: fadd_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vfadd.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x float>, <4 x float>* %x + %b = load <4 x float>, <4 x float>* %y + %c = fadd <4 x float> %a, %b + store <4 x float> %c, <4 x float>* %x + ret void +} + +define void @fadd_v2f64(<2 x double>* %x, <2 x double>* %y) { +; CHECK-LABEL: fadd_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: 
vle64.v v26, (a1) +; CHECK-NEXT: vfadd.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x double>, <2 x double>* %x + %b = load <2 x double>, <2 x double>* %y + %c = fadd <2 x double> %a, %b + store <2 x double> %c, <2 x double>* %x + ret void +} + +define void @fsub_v8f16(<8 x half>* %x, <8 x half>* %y) { +; CHECK-LABEL: fsub_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vfsub.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %b = load <8 x half>, <8 x half>* %y + %c = fsub <8 x half> %a, %b + store <8 x half> %c, <8 x half>* %x + ret void +} + +define void @fsub_v4f32(<4 x float>* %x, <4 x float>* %y) { +; CHECK-LABEL: fsub_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vfsub.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x float>, <4 x float>* %x + %b = load <4 x float>, <4 x float>* %y + %c = fsub <4 x float> %a, %b + store <4 x float> %c, <4 x float>* %x + ret void +} + +define void @fsub_v2f64(<2 x double>* %x, <2 x double>* %y) { +; CHECK-LABEL: fsub_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vfsub.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x double>, <2 x double>* %x + %b = load <2 x double>, <2 x double>* %y + %c = fsub <2 x double> %a, %b + store <2 x double> %c, <2 x double>* %x + ret void +} + +define void @fmul_v8f16(<8 x half>* %x, <8 x half>* %y) { +; CHECK-LABEL: fmul_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; 
CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vfmul.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %b = load <8 x half>, <8 x half>* %y + %c = fmul <8 x half> %a, %b + store <8 x half> %c, <8 x half>* %x + ret void +} + +define void @fmul_v4f32(<4 x float>* %x, <4 x float>* %y) { +; CHECK-LABEL: fmul_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vfmul.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x float>, <4 x float>* %x + %b = load <4 x float>, <4 x float>* %y + %c = fmul <4 x float> %a, %b + store <4 x float> %c, <4 x float>* %x + ret void +} + +define void @fmul_v2f64(<2 x double>* %x, <2 x double>* %y) { +; CHECK-LABEL: fmul_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vfmul.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x double>, <2 x double>* %x + %b = load <2 x double>, <2 x double>* %y + %c = fmul <2 x double> %a, %b + store <2 x double> %c, <2 x double>* %x + ret void +} + +define void @fdiv_v8f16(<8 x half>* %x, <8 x half>* %y) { +; CHECK-LABEL: fdiv_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vfdiv.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %b = load <8 x half>, <8 x half>* %y + %c = fdiv <8 x half> %a, %b + store <8 x half> %c, <8 x half>* %x + ret void +} + +define void @fdiv_v4f32(<4 x float>* %x, <4 x float>* %y) { +; CHECK-LABEL: fdiv_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: 
vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vfdiv.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x float>, <4 x float>* %x + %b = load <4 x float>, <4 x float>* %y + %c = fdiv <4 x float> %a, %b + store <4 x float> %c, <4 x float>* %x + ret void +} + +define void @fdiv_v2f64(<2 x double>* %x, <2 x double>* %y) { +; CHECK-LABEL: fdiv_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vfdiv.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x double>, <2 x double>* %x + %b = load <2 x double>, <2 x double>* %y + %c = fdiv <2 x double> %a, %b + store <2 x double> %c, <2 x double>* %x + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -0,0 +1,887 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-use-rvv-for-fixed-vectors < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-use-rvv-for-fixed-vectors < %s | FileCheck %s + +define void @add_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: add_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = add <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @add_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: add_v8i16: +; 
CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = add <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @add_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: add_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = add <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @add_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: add_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = add <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @sub_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: sub_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = sub <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @sub_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: sub_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; 
CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = sub <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @sub_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: sub_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = sub <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @sub_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: sub_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = sub <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @mul_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: mul_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vmul.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = mul <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @mul_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: mul_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; 
CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vmul.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = mul <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @mul_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: mul_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vmul.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = mul <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @mul_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: mul_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vmul.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = mul <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @and_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: and_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vand.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = and <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @and_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: and_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v 
v26, (a1) +; CHECK-NEXT: vand.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = and <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @and_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: and_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vand.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = and <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @and_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: and_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vand.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = and <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @or_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: or_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vor.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = or <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @or_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: or_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vor.vv v25, v25, v26 +; CHECK-NEXT: 
vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = or <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @or_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: or_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vor.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = or <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @or_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: or_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vor.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = or <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @xor_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: xor_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vxor.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = xor <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @xor_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: xor_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vxor.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 
x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = xor <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @xor_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: xor_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vxor.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = xor <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @xor_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: xor_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vxor.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = xor <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @lshr_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: lshr_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vsrl.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = lshr <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @lshr_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: lshr_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vsrl.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + 
%c = lshr <8 x i16> %a, %b
+  store <8 x i16> %c, <8 x i16>* %x
+  ret void
+}
+
+define void @lshr_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: lshr_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 4
+; CHECK-NEXT:    vsetvli a2, a2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vsrl.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = lshr <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+define void @lshr_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; CHECK-LABEL: lshr_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 2
+; CHECK-NEXT:    vsetvli a2, a2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    vle64.v v26, (a1)
+; CHECK-NEXT:    vsrl.vv v25, v25, v26
+; CHECK-NEXT:    vse64.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = lshr <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}
+
+; ashr (arithmetic shift right) on 128-bit fixed vectors is expected to
+; select vsra.vv with VL set to the element count.
+define void @ashr_v16i8(<16 x i8>* %x, <16 x i8>* %y) {
+; CHECK-LABEL: ashr_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 16
+; CHECK-NEXT:    vsetvli a2, a2, e8,m1,ta,mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    vle8.v v26, (a1)
+; CHECK-NEXT:    vsra.vv v25, v25, v26
+; CHECK-NEXT:    vse8.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <16 x i8>, <16 x i8>* %x
+  %b = load <16 x i8>, <16 x i8>* %y
+  %c = ashr <16 x i8> %a, %b
+  store <16 x i8> %c, <16 x i8>* %x
+  ret void
+}
+
+define void @ashr_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
+; CHECK-LABEL: ashr_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 8
+; CHECK-NEXT:    vsetvli a2, a2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vsra.vv v25, v25, v26
+; CHECK-NEXT:    vse16.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i16>, <8 x i16>* %x
+  %b = load <8 x i16>, <8 x i16>* %y
+  %c = ashr <8 x i16> %a, %b
+  store <8 x i16> %c, <8 x i16>* %x
+  ret void
+}
+
+define void @ashr_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: ashr_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 4
+; CHECK-NEXT:    vsetvli a2, a2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vsra.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = ashr <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+define void @ashr_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; CHECK-LABEL: ashr_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 2
+; CHECK-NEXT:    vsetvli a2, a2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    vle64.v v26, (a1)
+; CHECK-NEXT:    vsra.vv v25, v25, v26
+; CHECK-NEXT:    vse64.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = ashr <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}
+
+; shl on 128-bit fixed vectors is expected to select vsll.vv.
+define void @shl_v16i8(<16 x i8>* %x, <16 x i8>* %y) {
+; CHECK-LABEL: shl_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 16
+; CHECK-NEXT:    vsetvli a2, a2, e8,m1,ta,mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    vle8.v v26, (a1)
+; CHECK-NEXT:    vsll.vv v25, v25, v26
+; CHECK-NEXT:    vse8.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <16 x i8>, <16 x i8>* %x
+  %b = load <16 x i8>, <16 x i8>* %y
+  %c = shl <16 x i8> %a, %b
+  store <16 x i8> %c, <16 x i8>* %x
+  ret void
+}
+
+define void @shl_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
+; CHECK-LABEL: shl_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 8
+; CHECK-NEXT:    vsetvli a2, a2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vsll.vv v25, v25, v26
+; CHECK-NEXT:    vse16.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i16>, <8 x i16>* %x
+  %b = load <8 x i16>, <8 x i16>* %y
+  %c = shl <8 x i16> %a, %b
+  store <8 x i16> %c, <8 x i16>* %x
+  ret void
+}
+
+define void @shl_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: shl_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 4
+; CHECK-NEXT:    vsetvli a2, a2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vsll.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = shl <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+define void @shl_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; CHECK-LABEL: shl_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 2
+; CHECK-NEXT:    vsetvli a2, a2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    vle64.v v26, (a1)
+; CHECK-NEXT:    vsll.vv v25, v25, v26
+; CHECK-NEXT:    vse64.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = shl <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}
+
+; sdiv is a SIGNED divide, so the expected instruction is the signed vdiv.vv
+; (vdivu.vv is the unsigned form and would produce wrong results for negative
+; operands).
+; NOTE(review): if llc emits vdivu.vv for these functions, the sdiv/udiv
+; instruction-selection patterns are swapped - fix the pattern, do not
+; regenerate these checks to match it.
+define void @sdiv_v16i8(<16 x i8>* %x, <16 x i8>* %y) {
+; CHECK-LABEL: sdiv_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 16
+; CHECK-NEXT:    vsetvli a2, a2, e8,m1,ta,mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    vle8.v v26, (a1)
+; CHECK-NEXT:    vdiv.vv v25, v25, v26
+; CHECK-NEXT:    vse8.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <16 x i8>, <16 x i8>* %x
+  %b = load <16 x i8>, <16 x i8>* %y
+  %c = sdiv <16 x i8> %a, %b
+  store <16 x i8> %c, <16 x i8>* %x
+  ret void
+}
+
+define void @sdiv_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
+; CHECK-LABEL: sdiv_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 8
+; CHECK-NEXT:    vsetvli a2, a2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vdiv.vv v25, v25, v26
+; CHECK-NEXT:    vse16.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i16>, <8 x i16>* %x
+  %b = load <8 x i16>, <8 x i16>* %y
+  %c = sdiv <8 x i16> %a, %b
+  store <8 x i16> %c, <8 x i16>* %x
+  ret void
+}
+
+define void @sdiv_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: sdiv_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 4
+; CHECK-NEXT:    vsetvli a2, a2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vdiv.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = sdiv <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+define void @sdiv_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; CHECK-LABEL: sdiv_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 2
+; CHECK-NEXT:    vsetvli a2, a2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    vle64.v v26, (a1)
+; CHECK-NEXT:    vdiv.vv v25, v25, v26
+; CHECK-NEXT:    vse64.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = sdiv <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}
+
+; srem is a signed remainder and is expected to select the signed vrem.vv.
+define void @srem_v16i8(<16 x i8>* %x, <16 x i8>* %y) {
+; CHECK-LABEL: srem_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 16
+; CHECK-NEXT:    vsetvli a2, a2, e8,m1,ta,mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    vle8.v v26, (a1)
+; CHECK-NEXT:    vrem.vv v25, v25, v26
+; CHECK-NEXT:    vse8.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <16 x i8>, <16 x i8>* %x
+  %b = load <16 x i8>, <16 x i8>* %y
+  %c = srem <16 x i8> %a, %b
+  store <16 x i8> %c, <16 x i8>* %x
+  ret void
+}
+
+define void @srem_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
+; CHECK-LABEL: srem_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 8
+; CHECK-NEXT:    vsetvli a2, a2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vrem.vv v25, v25, v26
+; CHECK-NEXT:    vse16.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i16>, <8 x i16>* %x
+  %b = load <8 x i16>, <8 x i16>* %y
+  %c = srem <8 x i16> %a, %b
+  store <8 x i16> %c, <8 x i16>* %x
+  ret void
+}
+
+define void @srem_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: srem_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 4
+; CHECK-NEXT:    vsetvli a2, a2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vrem.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = srem <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+define void @srem_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; CHECK-LABEL: srem_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 2
+; CHECK-NEXT:    vsetvli a2, a2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    vle64.v v26, (a1)
+; CHECK-NEXT:    vrem.vv v25, v25, v26
+; CHECK-NEXT:    vse64.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = srem <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}
+
+; udiv is an UNSIGNED divide, so the expected instruction is the unsigned
+; vdivu.vv (vdiv.vv is the signed form).
+; NOTE(review): if llc emits vdiv.vv for these functions, the sdiv/udiv
+; instruction-selection patterns are swapped - fix the pattern, do not
+; regenerate these checks to match it.
+define void @udiv_v16i8(<16 x i8>* %x, <16 x i8>* %y) {
+; CHECK-LABEL: udiv_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 16
+; CHECK-NEXT:    vsetvli a2, a2, e8,m1,ta,mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    vle8.v v26, (a1)
+; CHECK-NEXT:    vdivu.vv v25, v25, v26
+; CHECK-NEXT:    vse8.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <16 x i8>, <16 x i8>* %x
+  %b = load <16 x i8>, <16 x i8>* %y
+  %c = udiv <16 x i8> %a, %b
+  store <16 x i8> %c, <16 x i8>* %x
+  ret void
+}
+
+define void @udiv_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
+; CHECK-LABEL: udiv_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 8
+; CHECK-NEXT:    vsetvli a2, a2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vdivu.vv v25, v25, v26
+; CHECK-NEXT:    vse16.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i16>, <8 x i16>* %x
+  %b = load <8 x i16>, <8 x i16>* %y
+  %c = udiv <8 x i16> %a, %b
+  store <8 x i16> %c, <8 x i16>* %x
+  ret void
+}
+
+define void @udiv_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: udiv_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 4
+; CHECK-NEXT:    vsetvli a2, a2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vdivu.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = udiv <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+define void @udiv_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; CHECK-LABEL: udiv_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 2
+; CHECK-NEXT:    vsetvli a2, a2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    vle64.v v26, (a1)
+; CHECK-NEXT:    vdivu.vv v25, v25, v26
+; CHECK-NEXT:    vse64.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = udiv <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}
+
+; urem is an unsigned remainder and is expected to select the unsigned
+; vremu.vv.
+define void @urem_v16i8(<16 x i8>* %x, <16 x i8>* %y) {
+; CHECK-LABEL: urem_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 16
+; CHECK-NEXT:    vsetvli a2, a2, e8,m1,ta,mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    vle8.v v26, (a1)
+; CHECK-NEXT:    vremu.vv v25, v25, v26
+; CHECK-NEXT:    vse8.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <16 x i8>, <16 x i8>* %x
+  %b = load <16 x i8>, <16 x i8>* %y
+  %c = urem <16 x i8> %a, %b
+  store <16 x i8> %c, <16 x i8>* %x
+  ret void
+}
+
+define void @urem_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
+; CHECK-LABEL: urem_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 8
+; CHECK-NEXT:    vsetvli a2, a2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vremu.vv v25, v25, v26
+; CHECK-NEXT:    vse16.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i16>, <8 x i16>* %x
+  %b = load <8 x i16>, <8 x i16>* %y
+  %c = urem <8 x i16> %a, %b
+  store <8 x i16> %c, <8 x i16>* %x
+  ret void
+}
+
+define void @urem_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: urem_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 4
+; CHECK-NEXT:    vsetvli a2, a2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vremu.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = urem <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+define void @urem_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; CHECK-LABEL: urem_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 2
+; CHECK-NEXT:    vsetvli a2, a2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    vle64.v v26, (a1)
+; CHECK-NEXT:    vremu.vv v25, v25, v26
+; CHECK-NEXT:    vse64.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = urem <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}