diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -32,6 +32,7 @@
 let params = T.Int in {
 def vaddq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (add $a, $b)>;
 def vsubq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (sub $a, $b)>;
+def vmulq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (mul $a, $b)>;
 }
 
 let params = T.Float in {
@@ -39,6 +40,8 @@
     NameOverride<"vaddq">;
 def vsubqf: Intrinsic<Vector, (args Vector:$a, Vector:$b), (fsub $a, $b)>,
     NameOverride<"vsubq">;
+def vmulqf: Intrinsic<Vector, (args Vector:$a, Vector:$b), (fmul $a, $b)>,
+    NameOverride<"vmulq">;
 }
 
 let params = T.Usual in {
@@ -55,6 +58,9 @@
 def vsubq_m: Intrinsic<
     Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
     (IRInt<"sub_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
+def vmulq_m: Intrinsic<
+    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
+    (IRInt<"mul_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
 }
 
 let params = T.Int in {
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -58,12 +58,14 @@
   let prefix = func # "(Builder, ";
 }
 def add: IRBuilder<"CreateAdd">;
+def mul: IRBuilder<"CreateMul">;
 def or: IRBuilder<"CreateOr">;
 def and: IRBuilder<"CreateAnd">;
 def sub: IRBuilder<"CreateSub">;
 def shl: IRBuilder<"CreateShl">;
 def lshr: IRBuilder<"CreateLShr">;
 def fadd: IRBuilder<"CreateFAdd">;
+def fmul: IRBuilder<"CreateFMul">;
 def fsub: IRBuilder<"CreateFSub">;
 def load: IRBuilder<"CreateLoad"> {
   let special_params = [IRBuilderAddrParam<0>];
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
@@ -0,0 +1,35 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vmulq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmulq(a, b);
+#else /* POLYMORPHIC */
+    return vmulq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_m_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vmulq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmulq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vmulq_m_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -809,6 +809,9 @@
 def int_arm_mve_sub_predicated: Intrinsic<[llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
+def int_arm_mve_mul_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+   [IntrNoMem]>;
 
 defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty],
    [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1512,8 +1512,9 @@
   let Inst{3-1} = Qm{2-0};
 }
 
-class MVE_VMULt1<string suffix, bits<2> size, list<dag> pattern=[]>
-  : MVE_int<"vmul", suffix, size, pattern> {
+class MVE_VMULt1<string iname, string suffix, bits<2> size,
+                 list<dag> pattern=[]>
+  : MVE_int<iname, suffix, size, pattern> {
 
   let Inst{28} = 0b0;
   let Inst{25-23} = 0b110;
@@ -1524,19 +1525,33 @@
   let validForTailPredication = 1;
 }
 
-def MVE_VMULt1i8 : MVE_VMULt1<"i8", 0b00>;
-def MVE_VMULt1i16 : MVE_VMULt1<"i16", 0b01>;
-def MVE_VMULt1i32 : MVE_VMULt1<"i32", 0b10>;
+multiclass MVE_VMUL_m<string iname, MVEVectorVTInfo VTI,
+                      SDNode unpred_op, Intrinsic pred_int> {
+  def "" : MVE_VMULt1<iname, VTI.Suffix, VTI.Size>;
 
-let Predicates = [HasMVEInt] in {
-  def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
-            (v16i8 (MVE_VMULt1i8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
-  def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
-            (v8i16 (MVE_VMULt1i16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
-  def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
-            (v4i32 (MVE_VMULt1i32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+  let Predicates = [HasMVEInt] in {
+    // Unpredicated multiply
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                            (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+    // Predicated multiply
+    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                            (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (i32 1), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive)))>;
+  }
 }
 
+multiclass MVE_VMUL<MVEVectorVTInfo VTI>
+  : MVE_VMUL_m<"vmul", VTI, mul, int_arm_mve_mul_predicated>;
+
+defm MVE_VMULi8 : MVE_VMUL<MVE_v16i8>;
+defm MVE_VMULi16 : MVE_VMUL<MVE_v8i16>;
+defm MVE_VMULi32 : MVE_VMUL<MVE_v4i32>;
+
 class MVE_VQxDMULH<string iname, string suffix, bits<2> size, bit rounding,
                    list<dag> pattern=[]>
   : MVE_int<iname, suffix, size, pattern> {
@@ -2805,8 +2820,8 @@
   let Inst{16} = 0b0;
 }
 
-class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]>
-  : MVEFloatArithNeon<"vmul", suffix, size, (outs MQPR:$Qd),
+class MVE_VMUL_fp<string iname, string suffix, bit size, list<dag> pattern=[]>
+  : MVEFloatArithNeon<iname, suffix, size, (outs MQPR:$Qd),
                       (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, "",
                       pattern> {
   bits<4> Qd;
@@ -2824,16 +2839,29 @@
   let validForTailPredication = 1;
 }
 
-def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>;
-def MVE_VMULf16 : MVE_VMUL_fp<"f16", 0b1>;
+multiclass MVE_VMULT_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
+                          SDNode unpred_op, Intrinsic pred_int> {
+  def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size{0}>;
 
-let Predicates = [HasMVEFloat] in {
-  def : Pat<(v4f32 (fmul (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
-            (v4f32 (MVE_VMULf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
-  def : Pat<(v8f16 (fmul (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
-            (v8f16 (MVE_VMULf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+  let Predicates = [HasMVEFloat] in {
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                            (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+              (VTI.Vec (!cast<Instruction>(NAME)
+                            (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (i32 1), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive)))>;
+  }
 }
 
+multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI>
+  : MVE_VMULT_fp_m<"vmul", 0, VTI, fmul, int_arm_mve_mul_predicated>;
+
+defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32>;
+defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16>;
+
 class MVE_VCMLA<string suffix, bit size, list<dag> pattern=[]>
   : MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd),
                       (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir
@@ -211,7 +211,7 @@
   ; CHECK:   renamable $r4 = t2ADDrr renamable $r0, renamable $r12, 14, $noreg, $noreg
   ; CHECK:   renamable $r12 = t2ADDri killed renamable $r12, 16, 14, $noreg, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg
-  ; CHECK:   renamable $q0 = MVE_VMULt1i8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $q0 = MVE_VMULi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
   ; CHECK:   MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 0, killed $noreg :: (store 16 into %ir.scevgep1, align 1)
   ; CHECK:   $lr = MVE_LETP renamable $lr, %bb.2
   ; CHECK: bb.3.for.cond.cleanup:
@@ -252,7 +252,7 @@
     renamable $r4 = t2ADDrr renamable $r0, renamable $r12, 14, $noreg, $noreg
     renamable $r12 = t2ADDri killed renamable $r12, 16, 14, $noreg, $noreg
     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg
-    renamable $q0 = MVE_VMULt1i8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+    renamable $q0 = MVE_VMULi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
     MVE_VPST 8, implicit $vpr
     MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 1, killed renamable $vpr :: (store 16 into %ir.scevgep1, align 1)
     renamable $lr = t2LoopDec killed renamable $lr, 1
@@ -325,7 +325,7 @@
   ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3
   ; CHECK:   renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv57, align 2)
   ; CHECK:   renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 2)
-  ; CHECK:   renamable $q0 = MVE_VMULt1i16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $q0 = MVE_VMULi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
   ; CHECK:   MVE_VSTRHU16 killed renamable $q0, renamable $r0, 0, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 2)
   ; CHECK:   renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg
   ; CHECK:   renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 16, 14, $noreg
@@ -358,7 +358,7 @@
     MVE_VPST 4, implicit $vpr
     renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv57, align 2)
     renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 2)
-    renamable $q0 = MVE_VMULt1i16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+    renamable $q0 = MVE_VMULi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
     MVE_VPST 8, implicit $vpr
     MVE_VSTRHU16 killed renamable $q0, renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 2)
     renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg
@@ -441,7 +441,7 @@
   ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 4)
   ; CHECK:   renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 0, killed $noreg :: (load 16 from %ir.lsr.iv1, align 4)
   ; CHECK:   $r3 = tMOVr $r2, 14, $noreg
-  ; CHECK:   renamable $q1 = nsw MVE_VMULt1i32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
+  ; CHECK:   renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
   ; CHECK:   renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg
   ; CHECK:   renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg
   ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg
@@ -490,7 +490,7 @@
     renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
    renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
     $r3 = tMOVr $r2, 14, $noreg
-    renamable $q1 = nsw MVE_VMULt1i32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
+    renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
     renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg
     renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg
     renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmulq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = mul <4 x i32> %b, %a
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vmulq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmul.f32 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = fmul <4 x float> %b, %a
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_m_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i8 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  ret <16 x i8> %2
+}
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+
+declare <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>)
+
+define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmulq_m_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.f16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
+  ret <8 x half> %2
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+
+declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>)
diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
--- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp
+++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
@@ -250,9 +250,9 @@
     case MVE_VMUL_qr_i8:
     case MVE_VMULf16:
     case MVE_VMULf32:
-    case MVE_VMULt1i16:
-    case MVE_VMULt1i8:
-    case MVE_VMULt1i32:
+    case MVE_VMULi16:
+    case MVE_VMULi8:
+    case MVE_VMULi32:
     case MVE_VMVN:
     case MVE_VMVNimmi16:
    case MVE_VMVNimmi32:
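Illustrative usage sketch (not part of the patch): the snippet below shows how the new intrinsics are driven from C, mirroring the calls exercised in the clang test added above. The wrapper function names are invented for this example; the intrinsic and type names come from <arm_mve.h>.

#include <arm_mve.h>

/* Unpredicated multiply: expected to lower to a single VMUL.I32
   (see test_vmulq_u32 in the new clang test). */
uint32x4_t mul_u32(uint32x4_t a, uint32x4_t b)
{
    return vmulq_u32(a, b);                /* polymorphic form: vmulq(a, b) */
}

/* Predicated (merging) multiply: lanes whose predicate bit is clear are
   taken from 'inactive'; this lowers via @llvm.arm.mve.mul.predicated. */
int8x16_t mul_masked_s8(int8x16_t inactive, int8x16_t a, int8x16_t b,
                        mve_pred16_t p)
{
    return vmulq_m_s8(inactive, a, b, p);  /* polymorphic form: vmulq_m(inactive, a, b, p) */
}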