diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -388,13 +388,56 @@
 defm vstrwq: scatter_offset_both<T.All32, u32, 2>;
 defm vstrdq: scatter_offset_both<T.Int64, u64, 3>;
 
-let params = [Void], pnt = PNT_None in
-def urshrl: Intrinsic<u64, (args u64:$value, imm_1to32:$shift),
-                      (seq (u32 (lshr $value, (u64 32))):$hi,
-                           (u32 $value):$lo,
-                           (IRInt<"urshrl"> $lo, $hi, $shift):$pair,
-                           (or (shl (u64 (xval $pair, 1)), (u64 32)),
-                               (u64 (xval $pair, 0))))>;
+// Base class for the scalar shift intrinsics.
+class ScalarShift<Type argtype, dag shiftCountArg, dag shiftCodeGen>:
+  Intrinsic<argtype, !con((args argtype:$value), shiftCountArg),
+            shiftCodeGen> {
+  let params = [Void];
+  let pnt = PNT_None;
+}
+
+// Subclass that includes the machinery to take a 64-bit input apart
+// into halves, retrieve the two halves of a shifted output as a pair,
+// and glue the pieces of the pair back into an i64 for output.
+class LongScalarShift<Type argtype, dag shiftCountArg, dag shiftCodeGen>:
+  ScalarShift<argtype, shiftCountArg,
+              (seq (u32 (lshr $value, (argtype 32))):$hi,
+                   (u32 $value):$lo,
+                   shiftCodeGen:$pair,
+                   (or (shl (u64 (xval $pair, 1)), (u64 32)),
+                       (u64 (xval $pair, 0))))>;
+
+// The family of saturating/rounding scalar shifts that take an
+// immediate shift count. They come in matched 32- and 64-bit pairs.
+multiclass ScalarSaturatingShiftImm<Type arg32, Type arg64> {
+  def "": ScalarShift<arg32, (args imm_1to32:$sh),
+                      (IRInt<NAME> $value, $sh)>;
+  def l: LongScalarShift<arg64, (args imm_1to32:$sh),
+                         (IRInt<NAME # "l"> $lo, $hi, $sh)>;
+}
+defm uqshl: ScalarSaturatingShiftImm<u32, u64>;
+defm urshr: ScalarSaturatingShiftImm<u32, u64>;
+defm sqshl: ScalarSaturatingShiftImm<s32, s64>;
+defm srshr: ScalarSaturatingShiftImm<s32, s64>;
+
+// The family of saturating/rounding scalar shifts that take a
+// register shift count. They also have 32- and 64-bit forms, but the
+// 64-bit form also has a version that saturates to 48 bits, so the IR
+// intrinsic takes an extra saturation-type operand.
+multiclass ScalarSaturatingShiftReg<Type arg32, Type arg64> {
+  def "": ScalarShift<arg32, (args s32:$sh),
+                      (IRInt<NAME> $value, $sh)>;
+  def l: LongScalarShift<arg64, (args s32:$sh),
+                         (IRInt<NAME # "l"> $lo, $hi, $sh, 64)>;
+  def l_sat48: LongScalarShift<arg64, (args s32:$sh),
+                               (IRInt<NAME # "l"> $lo, $hi, $sh, 48)>;
+}
+defm uqrshl: ScalarSaturatingShiftReg<u32, u64>;
+defm sqrshr: ScalarSaturatingShiftReg<s32, s64>;
+
+// The intrinsics for LSLL and ASRL come in 64-bit versions only, with
+// no saturation count.
+def lsll: LongScalarShift<u64, (args s32:$sh),
+                          (IRInt<"lsll"> $lo, $hi, $sh)>;
+def asrl: LongScalarShift<s64, (args s32:$sh),
+                          (IRInt<"asrl"> $lo, $hi, $sh)>;
 
 let params = T.Int32 in {
 def vadcq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
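For intuition, the LongScalarShift machinery above amounts to the following C-level sketch of the IR it generates (illustrative only; `pair32` and `mve_shift_ir` are hypothetical stand-ins for the `{i32, i32}`-returning IR intrinsics, not names from the patch):

    #include <stdint.h>

    typedef struct { uint32_t lo, hi; } pair32;

    /* Hypothetical stand-in for one of the {i32, i32} IR intrinsics. */
    pair32 mve_shift_ir(uint32_t lo, uint32_t hi, int32_t shift);

    /* Split the 64-bit input into halves, shift, then glue the two
       halves of the result back into a 64-bit value, mirroring the
       (seq ...) dag above. */
    uint64_t long_shift(uint64_t value, int32_t shift) {
        pair32 p = mve_shift_ir((uint32_t)value, (uint32_t)(value >> 32), shift);
        return ((uint64_t)p.hi << 32) | p.lo;
    }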
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -312,7 +312,7 @@
 
 // imm_1to32 can be in the range 1 to 32, unconditionally. (e.g. scalar shift
 // intrinsics)
-def imm_1to32 : Immediate<u32, IB_ConstRange<1, 32>>;
+def imm_1to32 : Immediate<s32, IB_ConstRange<1, 32>>;
 
 // imm_1248 can be 1, 2, 4 or 8. (e.g. vidupq)
 def imm_1248 : Immediate<u32, IB_ConstRange<1, 8>> {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c b/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
--- a/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
@@ -3,6 +3,237 @@
 
 #include <arm_mve.h>
 
+// CHECK-LABEL: @test_asrl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.asrl(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_asrl(int64_t value, int32_t shift)
+{
+    return asrl(value, shift);
+}
+
+// CHECK-LABEL: @test_lsll(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.lsll(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_lsll(uint64_t value, int32_t shift)
+{
+    return lsll(value, shift);
+}
+
+// CHECK-LABEL: @test_sqrshr(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.sqrshr(i32 [[VALUE:%.*]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_sqrshr(int32_t value, int32_t shift)
+{
+    return sqrshr(value, shift);
+}
+
+// CHECK-LABEL: @test_sqrshrl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 64)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_sqrshrl(int64_t value, int32_t shift)
+{
+    return sqrshrl(value, shift);
+}
+
+// CHECK-LABEL: @test_sqrshrl_sat48(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 48)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_sqrshrl_sat48(int64_t value, int32_t shift)
+{
+    return sqrshrl_sat48(value, shift);
+}
+
+// CHECK-LABEL: @test_sqshl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.sqshl(i32 [[VALUE:%.*]], i32 2)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_sqshl(int32_t value)
+{
+    return sqshl(value, 2);
+}
+
+// CHECK-LABEL: @test_sqshll(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.sqshll(i32 [[TMP2]], i32 [[TMP1]], i32 17)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_sqshll(int64_t value)
+{
+    return sqshll(value, 17);
+}
+
+// CHECK-LABEL: @test_srshr(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.srshr(i32 [[VALUE:%.*]], i32 6)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_srshr(int32_t value)
+{
+    return srshr(value, 6);
+}
+
+// CHECK-LABEL: @test_srshrl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.srshrl(i32 [[TMP2]], i32 [[TMP1]], i32 26)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_srshrl(int64_t value)
+{
+    return srshrl(value, 26);
+}
+
+// CHECK-LABEL: @test_uqrshl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.uqrshl(i32 [[VALUE:%.*]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_uqrshl(uint32_t value, int32_t shift)
+{
+    return uqrshl(value, shift);
+}
+
+// CHECK-LABEL: @test_uqrshll(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 64)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_uqrshll(uint64_t value, int32_t shift)
+{
+    return uqrshll(value, shift);
+}
+
+// CHECK-LABEL: @test_uqrshll_sat48(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 48)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_uqrshll_sat48(uint64_t value, int32_t shift)
+{
+    return uqrshll_sat48(value, shift);
+}
+
+// CHECK-LABEL: @test_uqshl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.uqshl(i32 [[VALUE:%.*]], i32 21)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_uqshl(uint32_t value)
+{
+    return uqshl(value, 21);
+}
+
+// CHECK-LABEL: @test_uqshll(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.uqshll(i32 [[TMP2]], i32 [[TMP1]], i32 16)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_uqshll(uint64_t value)
+{
+    return uqshll(value, 16);
+}
+
+// CHECK-LABEL: @test_urshr(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.urshr(i32 [[VALUE:%.*]], i32 22)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_urshr(uint32_t value)
+{
+    return urshr(value, 22);
+}
+
 // CHECK-LABEL: @test_urshrl(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -850,9 +850,25 @@
    [], [llvm_anyptr_ty, llvm_anyvector_ty, llvm_anyvector_ty,
     llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty], [IntrWriteMem]>;
 
-def int_arm_mve_urshrl: Intrinsic<
-  [llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-  [IntrNoMem]>;
+// MVE scalar shifts.
+class ARM_MVE_qrshift_single<list<LLVMType> value,
+                             list<LLVMType> saturate = []> :
+  Intrinsic<value, !listconcat(value, [llvm_i32_ty], saturate),
+            [IntrNoMem]>;
+multiclass ARM_MVE_qrshift<list<LLVMType> saturate = []> {
+  // Most of these shifts come in 32- and 64-bit versions. But only
+  // the 64-bit ones have the extra saturation argument (if any).
+  def "": ARM_MVE_qrshift_single<[llvm_i32_ty]>;
+  def l: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty], saturate>;
+}
+defm int_arm_mve_urshr: ARM_MVE_qrshift;
+defm int_arm_mve_uqshl: ARM_MVE_qrshift;
+defm int_arm_mve_srshr: ARM_MVE_qrshift;
+defm int_arm_mve_sqshl: ARM_MVE_qrshift;
+defm int_arm_mve_uqrshl: ARM_MVE_qrshift<[llvm_i32_ty]>;
+defm int_arm_mve_sqrshr: ARM_MVE_qrshift<[llvm_i32_ty]>;
+// LSLL and ASRL only have 64-bit versions, not 32.
+def int_arm_mve_lsll: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>;
+def int_arm_mve_asrl: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>;
 
 def int_arm_mve_vadc: Intrinsic<
   [llvm_anyvector_ty, llvm_i32_ty],
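For reference, the ARM_MVE_qrshift expansion for the register-count family yields exactly the signatures that appear in the declare lines of the tests below: the 32-bit form takes (value, shift), and the 64-bit form takes the two halves, the shift, and the extra saturation-type operand:

    declare i32 @llvm.arm.mve.uqrshl(i32, i32)
    declare { i32, i32 } @llvm.arm.mve.uqrshll(i32, i32, i32, i32)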
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -226,7 +226,8 @@
   void SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, bool Predicated);
 
   /// SelectMVE_LongShift - Select MVE 64-bit scalar shift intrinsics.
-  void SelectMVE_LongShift(SDNode *N, uint16_t Opcode, bool Immediate);
+  void SelectMVE_LongShift(SDNode *N, uint16_t Opcode, bool Immediate,
+                           bool HasSaturationOperand);
 
   /// SelectMVE_VADCSBC - Select MVE vector add/sub-with-carry intrinsics.
   void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
@@ -2399,7 +2400,8 @@
 }
 
 void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode,
-                                          bool Immediate) {
+                                          bool Immediate,
+                                          bool HasSaturationOperand) {
   SDLoc Loc(N);
   SmallVector<SDValue, 4> Ops;
@@ -2410,11 +2412,18 @@
   // The shift count
   if (Immediate) {
     int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
-    Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate offset
+    Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count
   } else {
     Ops.push_back(N->getOperand(3));
   }
 
+  // The immediate saturation operand, if any
+  if (HasSaturationOperand) {
+    int32_t SatOp = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
+    int SatBit = (SatOp == 64 ? 0 : 1);
+    Ops.push_back(getI32Imm(SatBit, Loc));
+  }
+
+  // MVE scalar shifts are IT-predicable, so include the standard
+  // predicate arguments.
   Ops.push_back(getAL(CurDAG, Loc));
@@ -4267,7 +4276,28 @@
     break;
 
   case Intrinsic::arm_mve_urshrl:
-    SelectMVE_LongShift(N, ARM::MVE_URSHRL, true);
+    SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false);
+    return;
+  case Intrinsic::arm_mve_uqshll:
+    SelectMVE_LongShift(N, ARM::MVE_UQSHLL, true, false);
+    return;
+  case Intrinsic::arm_mve_srshrl:
+    SelectMVE_LongShift(N, ARM::MVE_SRSHRL, true, false);
+    return;
+  case Intrinsic::arm_mve_sqshll:
+    SelectMVE_LongShift(N, ARM::MVE_SQSHLL, true, false);
+    return;
+  case Intrinsic::arm_mve_uqrshll:
+    SelectMVE_LongShift(N, ARM::MVE_UQRSHLL, false, true);
+    return;
+  case Intrinsic::arm_mve_sqrshrl:
+    SelectMVE_LongShift(N, ARM::MVE_SQRSHRL, false, true);
+    return;
+  case Intrinsic::arm_mve_lsll:
+    SelectMVE_LongShift(N, ARM::MVE_LSLLr, false, false);
+    return;
+  case Intrinsic::arm_mve_asrl:
+    SelectMVE_LongShift(N, ARM::MVE_ASRLr, false, false);
     return;
 
   case Intrinsic::arm_mve_vadc:
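A note on the saturation operand: at the IR level it is spelled as the literal width 64 or 48 for readability, and SelectMVE_LongShift re-encodes it into the instruction's single saturate bit. A minimal sketch of that mapping, restating the SatBit computation above:

    // 64 = saturate to the full 64 bits -> encoding 0
    // 48 = saturate to 48 bits          -> encoding 1
    static int satBitForOperand(int32_t SatOp) {
      return SatOp == 64 ? 0 : 1;
    }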
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -409,9 +409,12 @@
   let Inst{19-16} = RdaDest{3-0};
 }
 
-class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4, list<dag> pattern=[]>
+class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4>
   : MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, long_shift:$imm),
-                             "$RdaSrc, $imm", "$RdaDest = $RdaSrc", pattern> {
+                             "$RdaSrc, $imm", "$RdaDest = $RdaSrc",
+                             [(set rGPR:$RdaDest,
+                                 (i32 (!cast<Intrinsic>("int_arm_mve_" # iname)
+                                          (i32 rGPR:$RdaSrc), (i32 imm:$imm))))]> {
   bits<5> imm;
 
   let Inst{15} = 0b0;
@@ -427,9 +430,12 @@
 def MVE_UQSHL : MVE_ScalarShiftSRegImm<"uqshl", 0b00>;
 def MVE_URSHR : MVE_ScalarShiftSRegImm<"urshr", 0b01>;
 
-class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4, list<dag> pattern=[]>
+class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4>
   : MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, rGPR:$Rm),
-                             "$RdaSrc, $Rm", "$RdaDest = $RdaSrc", pattern> {
+                             "$RdaSrc, $Rm", "$RdaDest = $RdaSrc",
+                             [(set rGPR:$RdaDest,
+                                 (i32 (!cast<Intrinsic>("int_arm_mve_" # iname)
+                                          (i32 rGPR:$RdaSrc), (i32 rGPR:$Rm))))]> {
   bits<4> Rm;
 
   let Inst{15-12} = Rm{3-0};
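The !cast<Intrinsic>("int_arm_mve_" # iname) lookup ties each instruction to the IR intrinsic of the same name, so, for example, the selection pattern that MVE_UQSHL ends up with is equivalent to writing out:

    [(set rGPR:$RdaDest,
        (i32 (int_arm_mve_uqshl (i32 rGPR:$RdaSrc), (i32 imm:$imm))))]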
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/scalar-shifts.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scalar-shifts.ll
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/scalar-shifts.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scalar-shifts.ll
@@ -1,7 +1,264 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
 
-define arm_aapcs_vfpcc i64 @test_urshrl(i64 %value) {
+define i64 @test_asrl(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_asrl:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    asrl r0, r1, r2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %value, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %value to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.asrl(i32 %2, i32 %1, i32 %shift)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.asrl(i32, i32, i32)
+
+define i64 @test_lsll(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_lsll:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    lsll r0, r1, r2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %value, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %value to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.lsll(i32 %2, i32 %1, i32 %shift)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.lsll(i32, i32, i32)
+
+define i32 @test_sqrshr(i32 %value, i32 %shift) {
+; CHECK-LABEL: test_sqrshr:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sqrshr r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.sqrshr(i32 %value, i32 %shift)
+  ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.sqrshr(i32, i32)
+
+define i64 @test_sqrshrl(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_sqrshrl:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sqrshrl r0, r1, #64, r2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %value, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %value to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 %2, i32 %1, i32 %shift, i32 64)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.sqrshrl(i32, i32, i32, i32)
+
+define i64 @test_sqrshrl_sat48(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_sqrshrl_sat48:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sqrshrl r0, r1, #48, r2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %value, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %value to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 %2, i32 %1, i32 %shift, i32 48)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define i32 @test_sqshl(i32 %value) {
+; CHECK-LABEL: test_sqshl:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sqshl r0, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.sqshl(i32 %value, i32 2)
+  ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.sqshl(i32, i32)
+
+define i64 @test_sqshll(i64 %value) {
+; CHECK-LABEL: test_sqshll:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sqshll r0, r1, #17
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %value, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %value to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.sqshll(i32 %2, i32 %1, i32 17)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.sqshll(i32, i32, i32)
+
+define i32 @test_srshr(i32 %value) {
+; CHECK-LABEL: test_srshr:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    srshr r0, #6
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.srshr(i32 %value, i32 6)
+  ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.srshr(i32, i32)
+
+define i64 @test_srshrl(i64 %value) {
+; CHECK-LABEL: test_srshrl:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    srshrl r0, r1, #26
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %value, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %value to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.srshrl(i32 %2, i32 %1, i32 26)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.srshrl(i32, i32, i32)
+
+define i32 @test_uqrshl(i32 %value, i32 %shift) {
+; CHECK-LABEL: test_uqrshl:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    uqrshl r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.uqrshl(i32 %value, i32 %shift)
+  ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.uqrshl(i32, i32)
+
+define i64 @test_uqrshll(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_uqrshll:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    uqrshll r0, r1, #64, r2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %value, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %value to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 %2, i32 %1, i32 %shift, i32 64)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.uqrshll(i32, i32, i32, i32)
+
+define i64 @test_uqrshll_sat48(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_uqrshll_sat48:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    uqrshll r0, r1, #48, r2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %value, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %value to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 %2, i32 %1, i32 %shift, i32 48)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define i32 @test_uqshl(i32 %value) {
+; CHECK-LABEL: test_uqshl:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    uqshl r0, #21
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.uqshl(i32 %value, i32 21)
+  ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.uqshl(i32, i32)
+
+define i64 @test_uqshll(i64 %value) {
+; CHECK-LABEL: test_uqshll:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    uqshll r0, r1, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %value, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %value to i32
+  %3 = call { i32, i32 } @llvm.arm.mve.uqshll(i32 %2, i32 %1, i32 16)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.uqshll(i32, i32, i32)
+
+define i32 @test_urshr(i32 %value) {
+; CHECK-LABEL: test_urshr:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    urshr r0, #22
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.mve.urshr(i32 %value, i32 22)
+  ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.urshr(i32, i32)
+
+define i64 @test_urshrl(i64 %value) {
 ; CHECK-LABEL: test_urshrl:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    urshrl r0, r1, #6
@@ -10,10 +267,10 @@
   %0 = lshr i64 %value, 32
   %1 = trunc i64 %0 to i32
   %2 = trunc i64 %value to i32
-  %3 = tail call { i32, i32 } @llvm.arm.mve.urshrl(i32 %2, i32 %1, i32 6)
+  %3 = call { i32, i32 } @llvm.arm.mve.urshrl(i32 %2, i32 %1, i32 6)
   %4 = extractvalue { i32, i32 } %3, 1
   %5 = zext i32 %4 to i64
-  %6 = shl nuw i64 %5, 32
+  %6 = shl i64 %5, 32
   %7 = extractvalue { i32, i32 } %3, 0
   %8 = zext i32 %7 to i64
   %9 = or i64 %6, %8