ARM: add the "rdma" subtarget feature and the NEON VQRDMLAH / VQRDMLSH
instructions (Vector Rounding Double Multiply Add/Subtract).

This patch touches the intrinsic definitions, the subtarget feature and
predicate plumbing, the NEON instruction definitions (three-register and
by-lane forms, plus Q-register lane-selection patterns for vqrdmlah), the
asm parser feature query, and adds codegen, MC and disassembler tests.

Index: include/llvm/IR/IntrinsicsARM.td
===================================================================
--- include/llvm/IR/IntrinsicsARM.td
+++ include/llvm/IR/IntrinsicsARM.td
@@ -244,6 +244,10 @@
 
   // Vector Reciprocal Square Root Step.
   def int_arm_neon_vrsqrts : Neon_2Arg_Intrinsic;
+
+  // Vector Rounding Double Multiply Add/Subtract.
+  def int_arm_neon_vqrdmlah : Neon_2Arg_Intrinsic;
+  def int_arm_neon_vqrdmlsh : Neon_2Arg_Intrinsic;
 }
 
 // Vector Subtract.
Index: lib/Target/ARM/ARM.td
===================================================================
--- lib/Target/ARM/ARM.td
+++ lib/Target/ARM/ARM.td
@@ -72,6 +72,8 @@
                           [FeatureNEON]>;
 def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
                           "Enable support for CRC instructions">;
+def FeatureRDMA : SubtargetFeature<"rdma", "HasRDMA", "true",
+                   "Enable Rounding Double Multiply Add/Subtract instructions">;
 
 // Cyclone has preferred instructions for zeroing VFP registers, which can
 // execute in 0 cycles.
Index: lib/Target/ARM/ARMInstrInfo.td
===================================================================
--- lib/Target/ARM/ARMInstrInfo.td
+++ lib/Target/ARM/ARMInstrInfo.td
@@ -223,6 +223,8 @@
                                  AssemblerPredicate<"FeatureCrypto", "crypto">;
 def HasCRC           : Predicate<"Subtarget->hasCRC()">,
                                  AssemblerPredicate<"FeatureCRC", "crc">;
+def HasRDMA          : Predicate<"Subtarget->hasRDMA()">,
+                                 AssemblerPredicate<"FeatureRDMA", "rdma">;
 def HasFP16          : Predicate<"Subtarget->hasFP16()">,
                                  AssemblerPredicate<"FeatureFP16","half-float">;
 def HasDivide        : Predicate<"Subtarget->hasDivide()">,
Index: lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- lib/Target/ARM/ARMInstrNEON.td
+++ lib/Target/ARM/ARMInstrNEON.td
@@ -6050,6 +6050,47 @@
 defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>;
 defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
 
+
+// RDMA instructions
+let Predicates = [HasNEON, HasV8, HasRDMA] in {
+  multiclass VQRDMLxH<bits<4> opc,
+                      string opcodeStr, string dt,
+                      SDPatternOperator Int>
+    : N3VInt_HS<0b1, 0b0, opc, 0b1, N3RegFrm,
+                IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q,
+                opcodeStr, dt, Int>;
+  defm VQRDMLAH: VQRDMLxH<0b1011, "vqrdmlah", "s", int_arm_neon_vqrdmlah>;
+  defm VQRDMLSH: VQRDMLxH<0b1100, "vqrdmlsh", "s", int_arm_neon_vqrdmlsh>;
+
+  multiclass VQRDMLxH_Scalar<bit sub_op,
+                             string opcodeStr, string dt,
+                             SDPatternOperator Int>
+    : N3VIntSL_HS<{1,1,1,sub_op},
+                  IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q,
+                  opcodeStr, dt, Int>;
+
+  defm VQRDMLAH_Scalar : VQRDMLxH_Scalar<0, "vqrdmlah", "s",
+                                         int_arm_neon_vqrdmlah>;
+  defm VQRDMLSH_Scalar : VQRDMLxH_Scalar<1, "vqrdmlsh", "s",
+                                         int_arm_neon_vqrdmlsh>;
+
+  def : Pat<(v8i16 (int_arm_neon_vqrdmlah (v8i16 QPR:$src1),
+                                          (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+                                                               imm:$lane)))),
+            (v8i16 (VQRDMLAH_Scalarv8i16 (v8i16 QPR:$src1),
+                                         (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                                 (DSubReg_i16_reg imm:$lane))),
+                                         (SubReg_i16_lane imm:$lane)))>;
+  def : Pat<(v4i32 (int_arm_neon_vqrdmlah (v4i32 QPR:$src1),
+                                          (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+                                                               imm:$lane)))),
+            (v4i32 (VQRDMLAH_Scalarv4i32 (v4i32 QPR:$src1),
+                                         (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                                 (DSubReg_i32_reg imm:$lane))),
+                                         (SubReg_i32_lane imm:$lane)))>;
+}
+
+
 // Cryptography instructions
 let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
     DecoderNamespace = "v8Crypto", hasSideEffects = 0 in {
Index: lib/Target/ARM/ARMSubtarget.h
===================================================================
--- lib/Target/ARM/ARMSubtarget.h
+++ lib/Target/ARM/ARMSubtarget.h
@@ -181,6 +181,9 @@
   /// HasCRC - if true, processor supports CRC instructions
   bool HasCRC;
 
+  /// HasRDMA - if true, the processor supports RDMA instructions
+  bool HasRDMA;
+
   /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
   /// particularly effective at zeroing a VFP register.
   bool HasZeroCycleZeroing;
@@ -308,6 +311,7 @@
   bool hasNEON() const { return HasNEON;  }
   bool hasCrypto() const { return HasCrypto; }
   bool hasCRC() const { return HasCRC; }
+  bool hasRDMA() const { return HasRDMA; }
   bool hasVirtualization() const { return HasVirtualization; }
   bool useNEONForSinglePrecisionFP() const {
     return hasNEON() && UseNEONForSinglePrecisionFP;
Index: lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- lib/Target/ARM/ARMSubtarget.cpp
+++ lib/Target/ARM/ARMSubtarget.cpp
@@ -165,6 +165,7 @@
   HasTrustZone = false;
   HasCrypto = false;
   HasCRC = false;
+  HasRDMA = false;
   HasZeroCycleZeroing = false;
   AllowsUnalignedMem = false;
   Thumb2DSP = false;
Index: lib/Target/ARM/AsmParser/ARMAsmParser.cpp
===================================================================
--- lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -276,6 +276,9 @@
   bool hasD16() const {
     return STI.getFeatureBits() & ARM::FeatureD16;
   }
+  bool hasRDMA() const {
+    return STI.getFeatureBits() & ARM::FeatureRDMA;
+  }
 
   void SwitchMode() {
     uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
Index: test/CodeGen/ARM/vqrdml.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/vqrdml.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mattr=+rdma < %s | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
+target triple = "thumbv8-elf"
+
+define <4 x i16> @vqrdmlahs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vqrdmlahs16:
+;CHECK: vqrdmlah.s16
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqrdmlahs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vqrdmlahs32:
+;CHECK: vqrdmlah.s32
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  ret <2 x i32> %tmp3
+}
+
+define <8 x i16> @vqrdmlahQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vqrdmlahQs16:
+;CHECK: vqrdmlah.s16
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i16>* %B
+  %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+  ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqrdmlahQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vqrdmlahQs32:
+;CHECK: vqrdmlah.s32
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i32>* %B
+  %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+  ret <4 x i32> %tmp3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vqrdmlahQ_lanes16
+; CHECK: vqrdmlah.s16 q0, q0, d2[1]
+  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
+  %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vqrdmlahQ_lanes32
+; CHECK: vqrdmlah.s32 q0, q0, d2[1]
+  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
+  %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <4 x i16> @test_vqrdmlah_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vqrdmlah_lanes16
+; CHECK: vqrdmlah.s16 d0, d0, d1[1]
+  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+  %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
+  ret <4 x i16> %1
+}
+
+define arm_aapcs_vfpcc <2 x i32> @test_vqrdmlah_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vqrdmlah_lanes32
+; CHECK: vqrdmlah.s32 d0, d0, d1[1]
+  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+  %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
+  ret <2 x i32> %1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
Index: test/MC/ARM/armv8-extension-rdma.s
===================================================================
--- /dev/null
+++ test/MC/ARM/armv8-extension-rdma.s
@@ -0,0 +1,70 @@
+//RUN: not llvm-mc -triple thumb-none-linux-gnu -mattr=+rdma -mcpu=cortex-a57 -show-encoding < %s 2>%t | FileCheck %s --check-prefix=CHECK-THUMB
+//RUN: FileCheck --check-prefix=CHECK-ERROR <%t %s
+//RUN: not llvm-mc -triple arm-none-linux-gnu -mattr=+rdma -mcpu=cortex-a57 -show-encoding < %s 2>%t | FileCheck %s --check-prefix=CHECK-ARM
+//RUN: FileCheck --check-prefix=CHECK-ERROR <%t %s
+
+  .text
+
+  vqrdmlah.i8   q0, q1, q2
+  vqrdmlah.u16  d0, d1, d2
+  vqrdmlsh.f32  q3, q4, q5
+  vqrdmlsh.f64  d3, d5, d5
+
+//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR:   vqrdmlah.i8   q0, q1, q2
+//CHECK-ERROR:           ^
+//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR:   vqrdmlah.u16  d0, d1, d2
+//CHECK-ERROR:           ^
+//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR:   vqrdmlsh.f32  q3, q4, q5
+//CHECK-ERROR:           ^
+//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR:   vqrdmlsh.f64  d3, d5, d5
+//CHECK-ERROR:           ^
+
+  vqrdmlah.s16    q0, q1, q2
+//CHECK-ARM: vqrdmlah.s16 q0, q1, q2 @ encoding: [0x54,0x0b,0x12,0xf3]
+//CHECK-THUMB: vqrdmlah.s16 q0, q1, q2 @ encoding: [0x12,0xff,0x54,0x0b]
+  vqrdmlah.s32    d0, d1, d2
+//CHECK-ARM: vqrdmlah.s32 d0, d1, d2 @ encoding: [0x12,0x0b,0x21,0xf3]
+//CHECK-THUMB: vqrdmlah.s32 d0, d1, d2 @ encoding: [0x21,0xff,0x12,0x0b]
+  vqrdmlsh.s16    q0, q1, q2
+//CHECK-ARM: vqrdmlsh.s16 q0, q1, q2 @ encoding: [0x54,0x0c,0x12,0xf3]
+//CHECK-THUMB: vqrdmlsh.s16 q0, q1, q2 @ encoding: [0x12,0xff,0x54,0x0c]
+  vqrdmlsh.s32    d0, d1, d2
+//CHECK-ARM: vqrdmlsh.s32 d0, d1, d2 @ encoding: [0x12,0x0c,0x21,0xf3]
+//CHECK-THUMB: vqrdmlsh.s32 d0, d1, d2 @ encoding: [0x21,0xff,0x12,0x0c]
+
+
+
+  vqrdmlah.i8   q0, q1, d9[7]
+  vqrdmlah.u16  d0, d1, d2[3]
+  vqrdmlsh.f32  q3, q4, d5[1]
+  vqrdmlsh.f64  d3, d5, d5[0]
+
+//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR:   vqrdmlah.i8   q0, q1, d9[7]
+//CHECK-ERROR:           ^
+//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR:   vqrdmlah.u16  d0, d1, d2[3]
+//CHECK-ERROR:           ^
+//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR:   vqrdmlsh.f32  q3, q4, d5[1]
+//CHECK-ERROR:           ^
+//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR:   vqrdmlsh.f64  d3, d5, d5[0]
+//CHECK-ERROR:           ^
+
+  vqrdmlah.s16    q0, q1, d2[0]
+//CHECK-ARM: vqrdmlah.s16 q0, q1, d2[0] @ encoding: [0x42,0x0e,0x92,0xf3]
+//CHECK-THUMB: vqrdmlah.s16 q0, q1, d2[0] @ encoding: [0x92,0xff,0x42,0x0e]
+  vqrdmlah.s32    d0, d1, d2[0]
+//CHECK-ARM: vqrdmlah.s32 d0, d1, d2[0] @ encoding: [0x42,0x0e,0xa1,0xf2]
+//CHECK-THUMB: vqrdmlah.s32 d0, d1, d2[0] @ encoding: [0xa1,0xef,0x42,0x0e]
+  vqrdmlsh.s16    q0, q1, d2[0]
+//CHECK-ARM: vqrdmlsh.s16 q0, q1, d2[0] @ encoding: [0x42,0x0f,0x92,0xf3]
+//CHECK-THUMB: vqrdmlsh.s16 q0, q1, d2[0] @ encoding: [0x92,0xff,0x42,0x0f]
+  vqrdmlsh.s32    d0, d1, d2[0]
+//CHECK-ARM: vqrdmlsh.s32 d0, d1, d2[0] @ encoding: [0x42,0x0f,0xa1,0xf2]
+//CHECK-THUMB: vqrdmlsh.s32 d0, d1, d2[0] @ encoding: [0xa1,0xef,0x42,0x0f]
Index: test/MC/Disassembler/ARM/armv8-extension-rdma-t2.txt
===================================================================
--- /dev/null
+++ test/MC/Disassembler/ARM/armv8-extension-rdma-t2.txt
@@ -0,0 +1,19 @@
+# RUN: llvm-mc -triple thumbv8 -mattr=+rdma --disassemble < %s | FileCheck %s
+
+0x12,0xff,0x54,0x0b
+0x21,0xff,0x12,0x0b
+0x12,0xff,0x54,0x0c
+0x21,0xff,0x12,0x0c
+# CHECK: vqrdmlah.s16 q0, q1, q2
+# CHECK: vqrdmlah.s32 d0, d1, d2
+# CHECK: vqrdmlsh.s16 q0, q1, q2
+# CHECK: vqrdmlsh.s32 d0, d1, d2
+
+0x92,0xff,0x42,0x0e
+0xa1,0xef,0x42,0x0e
+0x92,0xff,0x42,0x0f
+0xa1,0xef,0x42,0x0f
+# CHECK: vqrdmlah.s16 q0, q1, d2[0]
+# CHECK: vqrdmlah.s32 d0, d1, d2[0]
+# CHECK: vqrdmlsh.s16 q0, q1, d2[0]
+# CHECK: vqrdmlsh.s32 d0, d1, d2[0]
Index: test/MC/Disassembler/ARM/armv8-extension-rdma.txt
===================================================================
--- /dev/null
+++ test/MC/Disassembler/ARM/armv8-extension-rdma.txt
@@ -0,0 +1,19 @@
+# RUN: llvm-mc -triple armv8 -mattr=+rdma --disassemble < %s | FileCheck %s
+
+0x54,0x0b,0x12,0xf3
+0x12,0x0b,0x21,0xf3
+0x54,0x0c,0x12,0xf3
+0x12,0x0c,0x21,0xf3
+# CHECK: vqrdmlah.s16 q0, q1, q2
+# CHECK: vqrdmlah.s32 d0, d1, d2
+# CHECK: vqrdmlsh.s16 q0, q1, q2
+# CHECK: vqrdmlsh.s32 d0, d1, d2
+
+0x42,0x0e,0x92,0xf3
+0x42,0x0e,0xa1,0xf2
+0x42,0x0f,0x92,0xf3
+0x42,0x0f,0xa1,0xf2
+# CHECK: vqrdmlah.s16 q0, q1, d2[0]
+# CHECK: vqrdmlah.s32 d0, d1, d2[0]
+# CHECK: vqrdmlsh.s16 q0, q1, d2[0]
+# CHECK: vqrdmlsh.s32 d0, d1, d2[0]