diff --git a/clang/include/clang/Basic/arm_cde.td b/clang/include/clang/Basic/arm_cde.td --- a/clang/include/clang/Basic/arm_cde.td +++ b/clang/include/clang/Basic/arm_cde.td @@ -13,6 +13,7 @@ include "arm_mve_defs.td" +// Base class for ACLE CDE intrinsics class CDEIntrinsic : Intrinsic { let builtinExtension = "cde"; @@ -40,6 +41,32 @@ def imm_12b : CDEImmediateBits<12>; def imm_13b : CDEImmediateBits<13>; -let pnt = PNT_None, params = T.None in -def cx1 : CDEIntrinsic $cp, $imm)>; +// Intrinsics for the CX* instructions operating on GPRs: plain, accumulating (a), 64-bit result (d) and 64-bit accumulating (da) variants +multiclass CDE_CX_m { + defvar cp = (args imm_coproc:$cp); + let pnt = PNT_None, params = T.None in { + def "" : CDEIntrinsic $cp), cgArgs, (? $imm))>; + def a : CDEIntrinsic $cp, $acc), + cgArgs, (? $imm))>; + + def d : + CDEIntrinsic $cp), cgArgs, (? $imm)):$pair, + (or (shl (u64 (xval $pair, 1)), (u64 32)), + (u64 (xval $pair, 0))))>; + def da : + CDEIntrinsic $cp, $acc_lo, $acc_hi), cgArgs, + (? $imm)):$pair, + (or (shl (u64 (xval $pair, 1)), (u64 32)), + (u64 (xval $pair, 0))))>; + } +} + +defm cx1 : CDE_CX_m<(args imm_13b:$imm), (args), (?)>; +defm cx2 : CDE_CX_m<(args imm_9b:$imm), (args u32:$n), (? $n)>; +defm cx3 : CDE_CX_m<(args imm_6b:$imm), (args u32:$n, u32:$m), (? 
$n, $m)>; diff --git a/clang/test/CodeGen/arm-cde-gpr.c b/clang/test/CodeGen/arm-cde-gpr.c --- a/clang/test/CodeGen/arm-cde-gpr.c +++ b/clang/test/CodeGen/arm-cde-gpr.c @@ -11,6 +11,150 @@ // CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx1(i32 0, i32 123) // CHECK-NEXT: ret i32 [[TMP0]] // -uint32_t test_cx1() { +uint32_t test_cx1(void) { return __arm_cx1(0, 123); } + +// CHECK-LABEL: @test_cx1a( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx1a(i32 0, i32 [[ACC:%.*]], i32 345) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx1a(uint32_t acc) { + return __arm_cx1a(0, acc, 345); +} + +// CHECK-LABEL: @test_cx1d( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx1d(i32 1, i32 567) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +uint64_t test_cx1d(void) { + return __arm_cx1d(1, 567); +} + +// CHECK-LABEL: @test_cx1da( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ACC]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx1da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 789) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +uint64_t test_cx1da(uint64_t acc) { + 
return __arm_cx1da(0, acc, 789); +} + +// CHECK-LABEL: @test_cx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx2(i32 0, i32 [[N:%.*]], i32 11) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx2(uint32_t n) { + return __arm_cx2(0, n, 11); +} + +// CHECK-LABEL: @test_cx2a( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx2a(i32 1, i32 [[ACC:%.*]], i32 [[N:%.*]], i32 22) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx2a(uint32_t acc, uint32_t n) { + return __arm_cx2a(1, acc, n, 22); +} + +// CHECK-LABEL: @test_cx2d( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx2d(i32 1, i32 [[N:%.*]], i32 33) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +uint64_t test_cx2d(uint32_t n) { + return __arm_cx2d(1, n, 33); +} + +// CHECK-LABEL: @test_cx2da( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ACC]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx2da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 [[N:%.*]], i32 44) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +uint64_t test_cx2da(uint64_t acc, uint32_t n) { + return __arm_cx2da(0, 
acc, n, 44); +} + +// CHECK-LABEL: @test_cx3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx3(i32 0, i32 [[N:%.*]], i32 [[M:%.*]], i32 1) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx3(uint32_t n, uint32_t m) { + return __arm_cx3(0, n, m, 1); +} + +// CHECK-LABEL: @test_cx3a( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx3a(i32 1, i32 [[ACC:%.*]], i32 [[N:%.*]], i32 [[M:%.*]], i32 2) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx3a(uint32_t acc, uint32_t n, uint32_t m) { + return __arm_cx3a(1, acc, n, m, 2); +} + +// CHECK-LABEL: @test_cx3d( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx3d(i32 1, i32 [[N:%.*]], i32 [[M:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +uint64_t test_cx3d(uint32_t n, uint32_t m) { + return __arm_cx3d(1, n, m, 3); +} + +// CHECK-LABEL: @test_cx3da( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ACC]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx3da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 [[N:%.*]], i32 [[M:%.*]], i32 4) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 
[[TMP9]] +// +uint64_t test_cx3da(uint64_t acc, uint32_t n, uint32_t m) { + return __arm_cx3da(0, acc, n, m, 4); +} diff --git a/clang/test/Sema/arm-cde-immediates.c b/clang/test/Sema/arm-cde-immediates.c --- a/clang/test/Sema/arm-cde-immediates.c +++ b/clang/test/Sema/arm-cde-immediates.c @@ -4,37 +4,62 @@ #include void test_coproc_gcp_instr(int a) { - __builtin_arm_cdp(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_cdp2(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mcr(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_cdp(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_cdp2(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mcr(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} __builtin_arm_mcr2(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mrc(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mrc2(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mcrr(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mcrr2(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mrrc(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mrrc2(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_ldc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_ldcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_ldc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_ldc2l(0, 2, &a); // expected-error {{coprocessor 0 must be 
configured as GCP}} - __builtin_arm_stc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_stcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_stc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_stc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mrc(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mrc2(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mcrr(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mcrr2(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mrrc(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mrrc2(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_ldc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_ldcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_ldc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_ldc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_stc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_stcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_stc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_stc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} } void test_coproc(uint32_t a) { (void)__arm_cx1(0, 0); - __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}} + __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}} __arm_cx1(-1, 0); // 
expected-error {{argument value -1 is outside the valid range [0, 7]}} __arm_cx1(8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - __arm_cx1(1, 0); // expected-error {{coprocessor 1 must be configured as CDE}} + __arm_cx1(1, 0); // expected-error {{coprocessor 1 must be configured as CDE}} } -void test_cx(uint32_t a) { +void test_cx(uint32_t a, uint64_t da, uint32_t n, uint32_t m) { (void)__arm_cx1(0, 0); - __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}} - __arm_cx1(0, a); // expected-error {{argument to '__arm_cx1' must be a constant integer}} - __arm_cx1(0, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}} + __arm_cx1(0, a); // expected-error {{argument to '__arm_cx1' must be a constant integer}} + __arm_cx1(0, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}} + __arm_cx1a(0, a, a); // expected-error {{argument to '__arm_cx1a' must be a constant integer}} + __arm_cx1a(0, a, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}} + __arm_cx1d(0, a); // expected-error {{argument to '__arm_cx1d' must be a constant integer}} + __arm_cx1d(0, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}} + __arm_cx1da(0, da, a); // expected-error {{argument to '__arm_cx1da' must be a constant integer}} + __arm_cx1da(0, da, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}} + + (void)__arm_cx2(0, n, 0); + __arm_cx2(0, n, a); // expected-error {{argument to '__arm_cx2' must be a constant integer}} + __arm_cx2(0, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}} + __arm_cx2a(0, a, n, a); // expected-error {{argument to '__arm_cx2a' must be a constant integer}} + __arm_cx2a(0, a, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}} + __arm_cx2d(0, n, a); // expected-error 
{{argument to '__arm_cx2d' must be a constant integer}} + __arm_cx2d(0, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}} + __arm_cx2da(0, da, n, a); // expected-error {{argument to '__arm_cx2da' must be a constant integer}} + __arm_cx2da(0, da, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}} + + (void)__arm_cx3(0, n, m, 0); + __arm_cx3(0, n, m, a); // expected-error {{argument to '__arm_cx3' must be a constant integer}} + __arm_cx3(0, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_cx3a(0, a, n, m, a); // expected-error {{argument to '__arm_cx3a' must be a constant integer}} + __arm_cx3a(0, a, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_cx3d(0, n, m, a); // expected-error {{argument to '__arm_cx3d' must be a constant integer}} + __arm_cx3d(0, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_cx3da(0, da, n, m, a); // expected-error {{argument to '__arm_cx3da' must be a constant integer}} + __arm_cx3da(0, da, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} } diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -1275,9 +1275,30 @@ // CDE (Custom Datapath Extension) -def int_arm_cde_cx1: Intrinsic< - [llvm_i32_ty], - [llvm_i32_ty /* coproc */, llvm_i32_ty /* imm */], - [IntrNoMem, ImmArg<0>, ImmArg<1>]>; +multiclass CDEGPRIntrinsics args> { + def "" : Intrinsic< + [llvm_i32_ty], + !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + def a : Intrinsic< + [llvm_i32_ty], + !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc */], args, + [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + + def d: Intrinsic< + [llvm_i32_ty /* 
lo */, llvm_i32_ty /* hi */], + !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + def da: Intrinsic< + [llvm_i32_ty /* lo */, llvm_i32_ty /* hi */], + !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc_lo */, + llvm_i32_ty /* acc_hi */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; +} + +defm int_arm_cde_cx1: CDEGPRIntrinsics<[]>; +defm int_arm_cde_cx2: CDEGPRIntrinsics<[llvm_i32_ty]>; +defm int_arm_cde_cx3: CDEGPRIntrinsics<[llvm_i32_ty, llvm_i32_ty]>; } // end TargetPrefix diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -277,6 +277,15 @@ void SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes, bool Wrapping, bool Predicated); + /// SelectCDE_CXxD - Select CDE dual-GPR instruction (one of CX1D, + /// CX1DA, CX2D, CX2DA, CX3D, CX3DA). + /// \arg \c NumExtraOps number of extra operands besides the coprocessor, + /// the accumulator and the immediate operand, i.e. 0 + /// for CX1*, 1 for CX2*, 2 for CX3* + /// \arg \c HasAccum whether the instruction has an accumulator operand + void SelectCDE_CXxD(SDNode *N, uint16_t Opcode, size_t NumExtraOps, + bool HasAccum); + /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used /// for loading D registers. @@ -2809,6 +2818,69 @@ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); } +void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode, + size_t NumExtraOps, bool HasAccum) { + bool IsBigEndian = CurDAG->getDataLayout().isBigEndian(); + SDLoc Loc(N); + SmallVector Ops; + + unsigned OpIdx = 1; + + // Convert and append the immediate operand designating the coprocessor. 
+ SDValue ImmCorpoc = N->getOperand(OpIdx++); + uint32_t ImmCoprocVal = cast(ImmCorpoc)->getZExtValue(); + Ops.push_back(getI32Imm(ImmCoprocVal, Loc)); + + // For accumulating variants copy the low and high order parts of the + // accumulator into a register pair and add it to the operand vector. + if (HasAccum) { + SDValue AccLo = N->getOperand(OpIdx++); + SDValue AccHi = N->getOperand(OpIdx++); + if (IsBigEndian) + std::swap(AccLo, AccHi); + Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, AccLo, AccHi), 0)); + } + + // Copy extra operands as-is. + for (size_t I = 0; I < NumExtraOps; I++) + Ops.push_back(N->getOperand(OpIdx++)); + + // Convert and append the trailing immediate operand. + SDValue Imm = N->getOperand(OpIdx); + uint32_t ImmVal = cast(Imm)->getZExtValue(); + Ops.push_back(getI32Imm(ImmVal, Loc)); + + // Accumulating variants are IT-predicable, add predicate operands. + if (HasAccum) { + SDValue Pred = getAL(CurDAG, Loc); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + Ops.push_back(Pred); + Ops.push_back(PredReg); + } + + // Create the CDE instruction. + SDNode *InstrNode = CurDAG->getMachineNode(Opcode, Loc, MVT::Untyped, Ops); + SDValue ResultPair = SDValue(InstrNode, 0); + + // The original intrinsic had two outputs, and the output of the dual-register + // CDE instruction is a register pair. We need to extract the two subregisters + // and replace all uses of the original outputs with the extracted + // subregisters. 
+ uint16_t SubRegs[2] = {ARM::gsub_0, ARM::gsub_1}; + if (IsBigEndian) + std::swap(SubRegs[0], SubRegs[1]); + + for (size_t ResIdx = 0; ResIdx < 2; ResIdx++) { + if (SDValue(N, ResIdx).use_empty()) + continue; + SDValue SubReg = CurDAG->getTargetExtractSubreg(SubRegs[ResIdx], Loc, + MVT::i32, ResultPair); + ReplaceUses(SDValue(N, ResIdx), SubReg); + } + + CurDAG->RemoveDeadNode(N); +} + void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, @@ -4773,6 +4845,40 @@ IntNo == Intrinsic::arm_mve_vdwdup_predicated); return; } + + case Intrinsic::arm_cde_cx1d: + case Intrinsic::arm_cde_cx1da: + case Intrinsic::arm_cde_cx2d: + case Intrinsic::arm_cde_cx2da: + case Intrinsic::arm_cde_cx3d: + case Intrinsic::arm_cde_cx3da: { + bool HasAccum = IntNo == Intrinsic::arm_cde_cx1da || + IntNo == Intrinsic::arm_cde_cx2da || + IntNo == Intrinsic::arm_cde_cx3da; + size_t NumExtraOps; + uint16_t Opcode; + switch (IntNo) { + case Intrinsic::arm_cde_cx1d: + case Intrinsic::arm_cde_cx1da: + NumExtraOps = 0; + Opcode = HasAccum ? ARM::CDE_CX1DA : ARM::CDE_CX1D; + break; + case Intrinsic::arm_cde_cx2d: + case Intrinsic::arm_cde_cx2da: + NumExtraOps = 1; + Opcode = HasAccum ? ARM::CDE_CX2DA : ARM::CDE_CX2D; + break; + case Intrinsic::arm_cde_cx3d: + case Intrinsic::arm_cde_cx3da: + NumExtraOps = 2; + Opcode = HasAccum ? 
ARM::CDE_CX3DA : ARM::CDE_CX3D; + break; + default: + llvm_unreachable("Unexpected opcode"); + } + SelectCDE_CXxD(N, Opcode, NumExtraOps, HasAccum); + return; + } } break; } diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td --- a/llvm/lib/Target/ARM/ARMInstrCDE.td +++ b/llvm/lib/Target/ARM/ARMInstrCDE.td @@ -215,6 +215,35 @@ def CDE_CX3D : CDE_CX3_Instr<"cx3d", cde_cx_params_dual_noacc>; def CDE_CX3DA : CDE_CX3_Instr<"cx3da", cde_cx_params_dual_acc>; +let Predicates = [HasCDE] in { + def : Pat<(i32 (int_arm_cde_cx1 timm:$coproc, timm:$imm)), + (i32 (CDE_CX1 p_imm:$coproc, imm_13b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx1a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + timm:$imm)), + (i32 (CDE_CX1A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + imm_13b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx2 timm:$coproc, GPRwithAPSR_NZCVnosp:$n, + timm:$imm)), + (i32 (CDE_CX2 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n, + imm_9b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx2a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, timm:$imm)), + (i32 (CDE_CX2A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, imm_9b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx3 timm:$coproc, GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, timm:$imm)), + (i32 (CDE_CX3 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx3a timm:$coproc, + GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, timm:$imm)), + (i32 (CDE_CX3A p_imm:$coproc, + GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>; +} + class CDE_RequiresSReg : Requires<[HasCDE, HasFPRegs]>; class CDE_RequiresDReg : Requires<[HasCDE, HasFPRegs]>; class CDE_RequiresQReg : Requires<[HasCDE, HasMVEInt]>; diff --git a/llvm/test/CodeGen/Thumb2/cde-gpr.ll b/llvm/test/CodeGen/Thumb2/cde-gpr.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/Thumb2/cde-gpr.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -verify-machineinstrs -o - %s | FileCheck %s + +declare i32 @llvm.arm.cde.cx1(i32 immarg, i32 immarg) +declare i32 @llvm.arm.cde.cx1a(i32 immarg, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx1d(i32 immarg, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx1da(i32 immarg, i32, i32, i32 immarg) + +declare i32 @llvm.arm.cde.cx2(i32 immarg, i32, i32 immarg) +declare i32 @llvm.arm.cde.cx2a(i32 immarg, i32, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx2d(i32 immarg, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx2da(i32 immarg, i32, i32, i32, i32 immarg) + +declare i32 @llvm.arm.cde.cx3(i32 immarg, i32, i32, i32 immarg) +declare i32 @llvm.arm.cde.cx3a(i32 immarg, i32, i32, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx3d(i32 immarg, i32, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx3da(i32 immarg, i32, i32, i32, i32, i32 immarg) + +define arm_aapcs_vfpcc i32 @test_cx1() { +; CHECK-LABEL: test_cx1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx1 p0, r0, #123 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx1(i32 0, i32 123) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_cx1a(i32 %acc) { +; CHECK-LABEL: test_cx1a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx1a p0, r0, #345 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx1a(i32 0, i32 %acc, i32 345) + ret i32 %0 +} + +define arm_aapcs_vfpcc i64 @test_cx1d() { +; CHECK-LABEL: test_cx1d: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx1d p1, r0, r1, #567 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.cde.cx1d(i32 1, i32 567) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = 
extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_cx1da(i64 %acc) { +; CHECK-LABEL: test_cx1da: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: cx1da p0, r0, r1, #789 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %acc, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %acc to i32 + %3 = call { i32, i32 } @llvm.arm.cde.cx1da(i32 0, i32 %2, i32 %1, i32 789) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i32 @test_cx2(i32 %n) { +; CHECK-LABEL: test_cx2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx2 p0, r0, r0, #11 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx2(i32 0, i32 %n, i32 11) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_cx2a(i32 %acc, i32 %n) { +; CHECK-LABEL: test_cx2a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx2a p1, r0, r1, #22 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx2a(i32 1, i32 %acc, i32 %n, i32 22) + ret i32 %0 +} + +define arm_aapcs_vfpcc i64 @test_cx2d(i32 %n) #0 { +; CHECK-LABEL: test_cx2d: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx2d p1, r0, r1, r0, #33 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.cde.cx2d(i32 1, i32 %n, i32 33) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_cx2da(i64 %acc, i32 %n) { +; CHECK-LABEL: test_cx2da: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: cx2da p0, r0, r1, r2, #44 +; 
CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %acc, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %acc to i32 + %3 = call { i32, i32 } @llvm.arm.cde.cx2da(i32 0, i32 %2, i32 %1, i32 %n, i32 44) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i32 @test_cx3(i32 %n, i32 %m) { +; CHECK-LABEL: test_cx3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx3 p0, r0, r0, r1, #1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx3(i32 0, i32 %n, i32 %m, i32 1) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_cx3a(i32 %acc, i32 %n, i32 %m) { +; CHECK-LABEL: test_cx3a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx3a p1, r0, r1, r2, #2 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx3a(i32 1, i32 %acc, i32 %n, i32 %m, i32 2) + ret i32 %0 +} + +define arm_aapcs_vfpcc i64 @test_cx3d(i32 %n, i32 %m) { +; CHECK-LABEL: test_cx3d: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx3d p1, r0, r1, r0, r1, #3 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.cde.cx3d(i32 1, i32 %n, i32 %m, i32 3) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_cx3da(i64 %acc, i32 %n, i32 %m) { +; CHECK-LABEL: test_cx3da: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: cx3da p0, r0, r1, r2, r3, #4 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %acc, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %acc to i32 + %3 = call { i32, i32 } @llvm.arm.cde.cx3da(i32 0, i32 %2, i32 %1, i32 %n, i32 %m, i32 4) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 
0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +}