diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5095,6 +5095,13 @@ switch (BuiltinID) { default: break; + case NEON::BI__builtin_neon_vpadd_v: + case NEON::BI__builtin_neon_vpaddq_v: + // We don't allow fp/int overloading of intrinsics. + if (VTy->getElementType()->isFloatingPointTy() && + Int == Intrinsic::aarch64_neon_addp) + Int = Intrinsic::aarch64_neon_faddp; + break; case NEON::BI__builtin_neon_vabs_v: case NEON::BI__builtin_neon_vabsq_v: if (VTy->getElementType()->isFloatingPointTy()) diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CodeGen/aarch64-neon-intrinsics.c --- a/clang/test/CodeGen/aarch64-neon-intrinsics.c +++ b/clang/test/CodeGen/aarch64-neon-intrinsics.c @@ -4411,7 +4411,7 @@ // CHECK-LABEL: @test_vpadd_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %a, <2 x float> %b) +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %a, <2 x float> %b) // CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> // CHECK: ret <2 x float> [[VPADD_V2_I]] float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) { @@ -4475,7 +4475,7 @@ // CHECK-LABEL: @test_vpaddq_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %a, <4 x float> %b) +// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %a, <4 x float> %b) // CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8> // CHECK: ret <4 x float> [[VPADDQ_V2_I]] float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) { @@ -4485,7 +4485,7 @@ // CHECK-LABEL: @test_vpaddq_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %a, <2 x double> %b) +// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %a, <2 x double> %b) // CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8> // CHECK: ret <2 x double> [[VPADDQ_V2_I]] float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) { diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c --- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c @@ -736,14 +736,14 @@ } // CHECK-LABEL: test_vpadd_f16 -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.aarch64.neon.addp.v4f16(<4 x half> %a, <4 x half> %b) +// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> %a, <4 x half> %b) // CHECK: ret <4 x half> [[ADD]] float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) { return vpadd_f16(a, b); } // CHECK-LABEL: test_vpaddq_f16 -// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.aarch64.neon.addp.v8f16(<8 x half> %a, <8 x half> %b) +// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> %a, <8 x half> %b) // CHECK: ret <8 x half> [[ADD]] float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) { return vpaddq_f16(a, b); diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -289,6 +289,7 @@ // Pairwise Add def int_aarch64_neon_addp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_faddp : AdvSIMD_2VectorArg_Intrinsic; // Long Pairwise Add // FIXME: In theory, we shouldn't need intrinsics for saddlp or diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -568,6 +568,17 @@ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer); return true; } + if (Name.startswith("aarch64.neon.addp")) { + if (F->arg_size() != 2) + break; // Invalid IR. + auto fArgs = F->getFunctionType()->params(); + VectorType *ArgTy = dyn_cast(fArgs[0]); + if (ArgTy && ArgTy->getElementType()->isFloatingPointTy()) { + NewFn = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::aarch64_neon_faddp, fArgs); + return true; + } + } break; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3499,7 +3499,7 @@ } defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>; defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h --- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h @@ -34,9 +34,6 @@ private: bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const; - - bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; }; } // End llvm namespace. #endif diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -64,11 +64,6 @@ return std::make_pair(0, EltTy); }); - // HACK: Check that the intrinsic isn't ambiguous. - // (See: https://bugs.llvm.org/show_bug.cgi?id=40968) - getActionDefinitionsBuilder(G_INTRINSIC) - .custom(); - getActionDefinitionsBuilder(G_PHI) .legalFor({p0, s16, s32, s64}) .clampScalar(0, s16, s64) @@ -517,30 +512,11 @@ return false; case TargetOpcode::G_VAARG: return legalizeVaArg(MI, MRI, MIRBuilder); - case TargetOpcode::G_INTRINSIC: - return legalizeIntrinsic(MI, MRI, MIRBuilder); } llvm_unreachable("expected switch to return"); } -bool AArch64LegalizerInfo::legalizeIntrinsic( - MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - // HACK: Don't allow faddp/addp for now. We don't pass down the type info - // necessary to get this right today. - // - // It looks like addp/faddp is the only intrinsic that's impacted by this. - // All other intrinsics fully describe the required types in their names. - // - // (See: https://bugs.llvm.org/show_bug.cgi?id=40968) - const MachineOperand &IntrinOp = MI.getOperand(1); - if (IntrinOp.isIntrinsicID() && - IntrinOp.getIntrinsicID() == Intrinsic::aarch64_neon_addp) - return false; - return true; -} - bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/fallback-ambiguous-addp-intrinsic.mir b/llvm/test/CodeGen/AArch64/GlobalISel/fallback-ambiguous-addp-intrinsic.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/fallback-ambiguous-addp-intrinsic.mir +++ /dev/null @@ -1,32 +0,0 @@ -# RUN: llc -mtriple aarch64-unknown-unknown -O0 -start-before=legalizer -pass-remarks-missed=gisel* %s -o - 2>&1 | FileCheck %s -# -# Check that we fall back on @llvm.aarch64.neon.addp and ensure that we get the -# correct instruction. -# https://bugs.llvm.org/show_bug.cgi?id=40968 - ---- | - define <2 x float> @foo(<2 x float> %v1, <2 x float> %v2) { - entry: - %v3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %v1, <2 x float> %v2) - ret <2 x float> %v3 - } - declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) -... ---- -name: foo -alignment: 2 -tracksRegLiveness: true -body: | - bb.1.entry: - liveins: $d0, $d1 - ; CHECK: remark: - ; CHECK-SAME: unable to legalize instruction: %2:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.addp), %0:_(<2 x s32>), %1:_(<2 x s32>) - ; CHECK: faddp - ; CHECK-NOT: addp - %0:_(<2 x s32>) = COPY $d0 - %1:_(<2 x s32>) = COPY $d1 - %2:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.addp), %0(<2 x s32>), %1(<2 x s32>) - $d0 = COPY %2(<2 x s32>) - RET_ReallyLR implicit $d0 - -... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -151,7 +151,7 @@ # DEBUG: .. the first uncovered type index: 1, OK # # DEBUG-NEXT: G_INTRINSIC (opcode {{[0-9]+}}): 0 type indices -# DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG: .. type index coverage check SKIPPED: no rules defined # # DEBUG-NEXT: G_INTRINSIC_W_SIDE_EFFECTS (opcode {{[0-9]+}}): 0 type indices # DEBUG: .. type index coverage check SKIPPED: no rules defined diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll @@ -65,27 +65,27 @@ ret <2 x i64> %val } -declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) +declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { ; CHECK: test_faddp_v2f32: - %val = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs) + %val = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %lhs, <2 x float> %rhs) ; CHECK: faddp v0.2s, v0.2s, v1.2s ret <2 x float> %val } define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK: test_faddp_v4f32: - %val = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs) + %val = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %lhs, <4 x float> %rhs) ; CHECK: faddp v0.4s, v0.4s, v1.4s ret <4 x float> %val } define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { ; CHECK: test_faddp_v2f64: - %val = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs) + %val = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %lhs, <2 x double> %rhs) ; CHECK: faddp v0.2d, v0.2d, v1.2d ret <2 x double> %val } diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll --- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll @@ -712,7 +712,7 @@ ;CHECK: faddp.2s %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B - %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -721,7 +721,7 @@ ;CHECK: faddp.4s %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B - %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -730,13 +730,13 @@ ;CHECK: faddp.2d %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B - %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: uaddl_duprhs diff --git a/llvm/test/CodeGen/AArch64/autoupgrade-aarch64-neon-addp-float.ll b/llvm/test/CodeGen/AArch64/autoupgrade-aarch64-neon-addp-float.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/autoupgrade-aarch64-neon-addp-float.ll @@ -0,0 +1,9 @@ +; RUN: opt -S < %s -mtriple=arm64 | FileCheck %s +declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) + +; CHECK: call <4 x float> @llvm.aarch64.neon.faddp.v4f32 +define <4 x float> @upgrade_aarch64_neon_addp_float(<4 x float> %a, <4 x float> %b) { + %res = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %res +} +