diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -153,7 +153,18 @@
   FMLSv4f32_OP1,
   FMLSv4f32_OP2,
   FMLSv4i32_indexed_OP1,
-  FMLSv4i32_indexed_OP2
+  FMLSv4i32_indexed_OP2,
+
+  FMULv2i32_indexed_OP1,
+  FMULv2i32_indexed_OP2,
+  FMULv2i64_indexed_OP1,
+  FMULv2i64_indexed_OP2,
+  FMULv4i16_indexed_OP1,
+  FMULv4i16_indexed_OP2,
+  FMULv4i32_indexed_OP1,
+  FMULv4i32_indexed_OP2,
+  FMULv8i16_indexed_OP1,
+  FMULv8i16_indexed_OP2,
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4917,6 +4917,55 @@
   return Found;
 }
 
+static bool getFMULPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  MachineBasicBlock &MBB = *Root.getParent();
+  bool Found = false;
+
+  auto Match = [&](unsigned Opcode, int Operand,
+                   MachineCombinerPattern Pattern) -> bool {
+    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    MachineOperand &MO = Root.getOperand(Operand);
+    MachineInstr *MI = nullptr;
+    if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
+      MI = MRI.getUniqueVRegDef(MO.getReg());
+    if (MI && MI->getOpcode() == Opcode) {
+      Patterns.push_back(Pattern);
+      return true;
+    }
+    return false;
+  };
+
+  typedef MachineCombinerPattern MCP;
+
+  switch (Root.getOpcode()) {
+  default:
+    return false;
+  case AArch64::FMULv2f32:
+    Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
+    Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
+    break;
+  case AArch64::FMULv2f64:
+    Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
+    Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
+    break;
+  case AArch64::FMULv4f16:
+    Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
+    Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
+    break;
+  case AArch64::FMULv4f32:
+    Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
+    Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
+    break;
+  case AArch64::FMULv8f16:
+    Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
+    Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
+    break;
+  }
+
+  return Found;
+}
+
 /// Return true when a code sequence can improve throughput. It
 /// should be called only for instructions in loops.
 /// \param Pattern - combiner pattern
@@ -4980,6 +5029,16 @@
   case MachineCombinerPattern::FMLSv2f64_OP2:
   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
   case MachineCombinerPattern::FMLSv4f32_OP2:
+  case MachineCombinerPattern::FMULv2i32_indexed_OP1:
+  case MachineCombinerPattern::FMULv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMULv2i64_indexed_OP1:
+  case MachineCombinerPattern::FMULv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMULv4i16_indexed_OP1:
+  case MachineCombinerPattern::FMULv4i16_indexed_OP2:
+  case MachineCombinerPattern::FMULv4i32_indexed_OP1:
+  case MachineCombinerPattern::FMULv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMULv8i16_indexed_OP1:
+  case MachineCombinerPattern::FMULv8i16_indexed_OP2:
   case MachineCombinerPattern::MULADDv8i8_OP1:
   case MachineCombinerPattern::MULADDv8i8_OP2:
   case MachineCombinerPattern::MULADDv16i8_OP1:
@@ -5036,6 +5095,8 @@
   if (getMaddPatterns(Root, Patterns))
     return true;
   // Floating point patterns
+  if (getFMULPatterns(Root, Patterns))
+    return true;
   if (getFMAPatterns(Root, Patterns))
     return true;
 
@@ -5124,6 +5185,42 @@
   return MUL;
 }
 
+/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
+static MachineInstr *
+genIndexedMultiply(MachineInstr &Root,
+                   SmallVectorImpl<MachineInstr *> &InsInstrs,
+                   unsigned IdxDupOp, unsigned MulOpc,
+                   const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
+  assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
+         "Invalid index of FMUL operand");
+
+  MachineFunction &MF = *Root.getMF();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  MachineInstr *Dup =
+      MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
+
+  Register DupSrcReg = Dup->getOperand(1).getReg();
+  MRI.clearKillFlags(DupSrcReg);
+  MRI.constrainRegClass(DupSrcReg, RC);
+
+  unsigned DupSrcLane = Dup->getOperand(2).getImm();
+
+  unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
+  MachineOperand &MulOp = Root.getOperand(IdxMulOp);
+
+  Register ResultReg = Root.getOperand(0).getReg();
+
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MulOpc), ResultReg)
+            .add(MulOp)
+            .addReg(DupSrcReg)
+            .addImm(DupSrcLane);
+
+  InsInstrs.push_back(MIB);
+  return &Root;
+}
+
 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
 /// instructions.
 ///
@@ -6082,12 +6179,53 @@
     }
     break;
   }
+  case MachineCombinerPattern::FMULv2i32_indexed_OP1:
+  case MachineCombinerPattern::FMULv2i32_indexed_OP2: {
+    unsigned IdxDupOp =
+        (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2;
+    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
+                       &AArch64::FPR128RegClass, MRI);
+    break;
+  }
+  case MachineCombinerPattern::FMULv2i64_indexed_OP1:
+  case MachineCombinerPattern::FMULv2i64_indexed_OP2: {
+    unsigned IdxDupOp =
+        (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2;
+    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
+                       &AArch64::FPR128RegClass, MRI);
+    break;
+  }
+  case MachineCombinerPattern::FMULv4i16_indexed_OP1:
+  case MachineCombinerPattern::FMULv4i16_indexed_OP2: {
+    unsigned IdxDupOp =
+        (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2;
+    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
+                       &AArch64::FPR128_loRegClass, MRI);
+    break;
+  }
+  case MachineCombinerPattern::FMULv4i32_indexed_OP1:
+  case MachineCombinerPattern::FMULv4i32_indexed_OP2: {
+    unsigned IdxDupOp =
+        (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2;
+    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
+                       &AArch64::FPR128RegClass, MRI);
+    break;
+  }
+  case MachineCombinerPattern::FMULv8i16_indexed_OP1:
+  case MachineCombinerPattern::FMULv8i16_indexed_OP2: {
+    unsigned IdxDupOp =
+        (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2;
+    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
+                       &AArch64::FPR128_loRegClass, MRI);
+    break;
+  }
   } // end switch (Pattern)
   // Record MUL and ADD/SUB for deletion
   // FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and
   // CodeGen/AArch64/urem-seteq-nonzero.ll.
   // assert(MUL && "MUL was never set");
-  DelInstrs.push_back(MUL);
+  if (MUL)
+    DelInstrs.push_back(MUL);
   DelInstrs.push_back(&Root);
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
--- a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -enable-unsafe-fp-math -verify-machineinstrs | FileCheck %s
+
 define void @foo_2d(double* %src) {
 ; CHECK-LABEL: %entry
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
@@ -134,3 +135,128 @@
 for.end: ; preds = %for.body
   ret void
 }
+
+define void @indexed_2s(<2 x float> %shuf, <2 x float> %add,
+                        <2 x float>* %pmul, <2 x float>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+  %shuffle = shufflevector <2 x float> %shuf, <2 x float> undef, <2 x i32> zeroinitializer
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+  %pmul_i = getelementptr inbounds <2 x float>, <2 x float>* %pmul, i64 %i
+  %pret_i = getelementptr inbounds <2 x float>, <2 x float>* %pret, i64 %i
+
+  %mul_i = load <2 x float>, <2 x float>* %pmul_i
+
+  %mul = fmul fast <2 x float> %mul_i, %shuffle
+  %muladd = fadd fast <2 x float> %mul, %add
+
+  store <2 x float> %muladd, <2 x float>* %pret_i, align 16
+  %inext = add i64 %i, 1
+  br label %for.body
+}
+
+define void @indexed_2d(<2 x double> %shuf, <2 x double> %add,
+                        <2 x double>* %pmul, <2 x double>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+  %shuffle = shufflevector <2 x double> %shuf, <2 x double> undef, <2 x i32> zeroinitializer
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+  %pmul_i = getelementptr inbounds <2 x double>, <2 x double>* %pmul, i64 %i
+  %pret_i = getelementptr inbounds <2 x double>, <2 x double>* %pret, i64 %i
+
+  %mul_i = load <2 x double>, <2 x double>* %pmul_i
+
+  %mul = fmul fast <2 x double> %mul_i, %shuffle
+  %muladd = fadd fast <2 x double> %mul, %add
+
+  store <2 x double> %muladd, <2 x double>* %pret_i, align 16
+  %inext = add i64 %i, 1
+  br label %for.body
+}
+
+define void @indexed_4s(<4 x float> %shuf, <4 x float> %add,
+                        <4 x float>* %pmul, <4 x float>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+  %shuffle = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> zeroinitializer
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+  %pmul_i = getelementptr inbounds <4 x float>, <4 x float>* %pmul, i64 %i
+  %pret_i = getelementptr inbounds <4 x float>, <4 x float>* %pret, i64 %i
+
+  %mul_i = load <4 x float>, <4 x float>* %pmul_i
+
+  %mul = fmul fast <4 x float> %mul_i, %shuffle
+  %muladd = fadd fast <4 x float> %mul, %add
+
+  store <4 x float> %muladd, <4 x float>* %pret_i, align 16
+  %inext = add i64 %i, 1
+  br label %for.body
+}
+
+define void @indexed_4h(<4 x half> %shuf, <4 x half> %add,
+                        <4 x half>* %pmul, <4 x half>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+  %shuffle = shufflevector <4 x half> %shuf, <4 x half> undef, <4 x i32> zeroinitializer
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+  %pmul_i = getelementptr inbounds <4 x half>, <4 x half>* %pmul, i64 %i
+  %pret_i = getelementptr inbounds <4 x half>, <4 x half>* %pret, i64 %i
+
+  %mul_i = load <4 x half>, <4 x half>* %pmul_i
+
+  %mul = fmul fast <4 x half> %mul_i, %shuffle
+  %muladd = fadd fast <4 x half> %mul, %add
+
+  store <4 x half> %muladd, <4 x half>* %pret_i, align 16
+  %inext = add i64 %i, 1
+  br label %for.body
+}
+
+define void @indexed_8h(<8 x half> %shuf, <8 x half> %add,
+                        <8 x half>* %pmul, <8 x half>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+  %shuffle = shufflevector <8 x half> %shuf, <8 x half> undef, <8 x i32> zeroinitializer
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+  %pmul_i = getelementptr inbounds <8 x half>, <8 x half>* %pmul, i64 %i
+  %pret_i = getelementptr inbounds <8 x half>, <8 x half>* %pret, i64 %i
+
+  %mul_i = load <8 x half>, <8 x half>* %pmul_i
+
+  %mul = fmul fast <8 x half> %mul_i, %shuffle
+  %muladd = fadd fast <8 x half> %mul, %add
+
+  store <8 x half> %muladd, <8 x half>* %pret_i, align 16
+  %inext = add i64 %i, 1
+  br label %for.body
+}
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-fmul-dup.mir b/llvm/test/CodeGen/AArch64/machine-combiner-fmul-dup.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-fmul-dup.mir
@@ -0,0 +1,547 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=machine-combiner -o - -simplify-mir -mtriple=aarch64-unknown-linux-gnu -mattr=+fullfp16 -verify-machineinstrs %s | FileCheck %s
+--- |
+  ; ModuleID = 'lit.ll'
+  source_filename = "lit.ll"
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-linux-gnu"
+
+  define void @indexed_2s(<2 x float> %shuf, <2 x float> %mu, <2 x float> %ad, <2 x float>* %ret) #0 {
+  entry:
+    %shuffle = shufflevector <2 x float> %shuf, <2 x float> undef, <2 x i32> zeroinitializer
+    br label %for.cond
+
+  for.cond:                                         ; preds = %for.cond, %entry
+    %mul = fmul <2 x float> %mu, %shuffle
+    %add = fadd <2 x float> %mul, %ad
+    store <2 x float> %add, <2 x float>* %ret, align 16
+    br label %for.cond
+  }
+
+  define void @indexed_2s_rev(<2 x float> %shuf, <2 x float> %mu, <2 x float> %ad, <2 x float>* %ret) #0 {
+  entry:
+    %shuffle = shufflevector <2 x float> %shuf, <2 x float> undef, <2 x i32> zeroinitializer
+    br label %for.cond
+
+  for.cond:                                         ; preds = %for.cond, %entry
+    %mul = fmul <2 x float> %shuffle, %mu
+    %add = fadd <2 x float> %mul, %ad
+    store <2 x float> %add, <2 x float>* %ret, align 16
+    br label %for.cond
+  }
+
+  define void @indexed_2d(<2 x double> %shuf, <2 x double> %mu, <2 x double> %ad, <2 x double>* %ret) #0 {
+  entry:
+    %shuffle = shufflevector <2 x double> %shuf, <2 x double> undef, <2 x i32> zeroinitializer
+    br label %for.cond
+
+  for.cond:                                         ; preds = %for.cond, %entry
+    %mul = fmul <2 x double> %mu, %shuffle
+    %add = fadd <2 x double> %mul, %ad
+    store <2 x double> %add, <2 x double>* %ret, align 16
+    br label %for.cond
+  }
+
+  define void @indexed_4s(<4 x float> %shuf, <4 x float> %mu, <4 x float> %ad, <4 x float>* %ret) #0 {
+  entry:
+    %shuffle = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> zeroinitializer
+    br label %for.cond
+
+  for.cond:                                         ; preds = %for.cond, %entry
+    %mul = fmul <4 x float> %mu, %shuffle
+    %add = fadd <4 x float> %mul, %ad
+    store <4 x float> %add, <4 x float>* %ret, align 16
+    br label %for.cond
+  }
+
+  define void @indexed_4h(<4 x half> %shuf, <4 x half> %mu, <4 x half> %ad, <4 x half>* %ret) #0 {
+  entry:
+    %shuffle = shufflevector <4 x half> %shuf, <4 x half> undef, <4 x i32> zeroinitializer
+    br label %for.cond
+
+  for.cond:
+    %mul = fmul <4 x half> %mu, %shuffle
+    %add = fadd <4 x half> %mul, %ad
+    store <4 x half> %add, <4 x half>* %ret, align 16
+    br label %for.cond
+  }
+
+  define void @indexed_8h(<8 x half> %shuf, <8 x half> %mu, <8 x half> %ad, <8 x half>* %ret) #0 {
+  entry:
+    %shuffle = shufflevector <8 x half> %shuf, <8 x half> undef, <8 x i32> zeroinitializer
+    br label %for.cond
+
+  for.cond:
+    %mul = fmul <8 x half> %mu, %shuffle
+    %add = fadd <8 x half> %mul, %ad
+    store <8 x half> %add, <8 x half>* %ret, align 16
+    br label %for.cond
+  }
+
+  define void @kill_state(<2 x float> %shuf, <2 x float> %mu, <2 x float> %ad,
+                          <2 x float>* %ret, <2 x float>* %ret2, float %f) #0 {
+  entry:
+    %zero_elem = extractelement <2 x float> %shuf, i32 0
+    %ins = insertelement <2 x float> undef, float %zero_elem, i32 0
+    %shuffle = shufflevector <2 x float> %ins, <2 x float> undef, <2 x i32> zeroinitializer
+    %ins2 = insertelement <2 x float> %ins, float %f, i32 1
+    store <2 x float> %ins2, <2 x float>* %ret2, align 8
+    br label %for.cond
+
+  for.cond:                                         ; preds = %for.cond, %entry
+    %mul = fmul <2 x float> %mu, %shuffle
+    %add = fadd <2 x float> %mul, %ad
+    store <2 x float> %add, <2 x float>* %ret, align 16
+    br label %for.cond
+  }
+
+  attributes #0 = { "target-cpu"="cortex-a57" }
+
+...
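Reviewer note: the rewrite exercised by each of the MIR functions below boils down to the following, sketched here for the indexed_2s case with the virtual register numbers used in that test. The DUP stays in the entry block; the FMUL inside the loop is replaced by an indexed FMUL that reads the DUP's source directly:

  # Before the combine: the loop multiplies by the DUP result.
  %0:fpr64 = DUPv2i32lane killed %7, 0
  %9:fpr64 = FMULv2f32 %5, %0
  # After the combine: the lane access is folded into the multiply,
  # and the kill flag on the DUP source %7 is dropped.
  %9:fpr64 = FMULv2i32_indexed %5, %7, 0

The DUP itself is not recorded for deletion (for these patterns no MUL instruction is set, which is why DelInstrs.push_back(MUL) now needs the "if (MUL)" guard above); it is left to later dead-code cleanup once it has no remaining users.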
+---
+name: indexed_2s
+alignment: 16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr64 }
+  - { id: 1, class: fpr64 }
+  - { id: 2, class: fpr64 }
+  - { id: 3, class: fpr64 }
+  - { id: 4, class: gpr64common }
+  - { id: 5, class: fpr64 }
+  - { id: 6, class: fpr64 }
+  - { id: 7, class: fpr128 }
+  - { id: 8, class: fpr128 }
+  - { id: 9, class: fpr64 }
+  - { id: 10, class: fpr64 }
+liveins:
+  - { reg: '$d0', virtual-reg: '%1' }
+  - { reg: '$d1', virtual-reg: '%2' }
+  - { reg: '$d2', virtual-reg: '%3' }
+  - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  ; CHECK-LABEL: name: indexed_2s
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $d0, $d1, $d2, $x0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:fpr64 = COPY $d2
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:fpr64 = COPY $d1
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:fpr64 = COPY $d0
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY3]], %subreg.dsub
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:fpr64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:fpr64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[DUPv2i32lane:%[0-9]+]]:fpr64 = DUPv2i32lane [[INSERT_SUBREG]], 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1.for.cond:
+  ; CHECK-NEXT:   [[FMULv2i32_indexed:%[0-9]+]]:fpr64 = FMULv2i32_indexed [[COPY5]], [[INSERT_SUBREG]], 0
+  ; CHECK-NEXT:   [[FADDv2f32_:%[0-9]+]]:fpr64 = FADDv2f32 killed [[FMULv2i32_indexed]], [[COPY4]]
+  ; CHECK-NEXT:   STRDui killed [[FADDv2f32_]], [[COPY]], 0 :: (store (s64) into %ir.ret, align 16)
+  ; CHECK-NEXT:   B %bb.1
+  bb.0.entry:
+    liveins: $d0, $d1, $d2, $x0
+
+    %4:gpr64common = COPY $x0
+    %3:fpr64 = COPY $d2
+    %2:fpr64 = COPY $d1
+    %1:fpr64 = COPY $d0
+    %8:fpr128 = IMPLICIT_DEF
+    %7:fpr128 = INSERT_SUBREG %8, %1, %subreg.dsub
+    %6:fpr64 = COPY %3
+    %5:fpr64 = COPY %2
+    %0:fpr64 = DUPv2i32lane killed %7, 0
+
+  bb.1.for.cond:
+    %9:fpr64 = FMULv2f32 %5, %0
+    %10:fpr64 = FADDv2f32 killed %9, %6
+    STRDui killed %10, %4, 0 :: (store 8 into %ir.ret, align 16)
+    B %bb.1
+
+...
+---
+name: indexed_2s_rev
+alignment: 16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr64 }
+  - { id: 1, class: fpr64 }
+  - { id: 2, class: fpr64 }
+  - { id: 3, class: fpr64 }
+  - { id: 4, class: gpr64common }
+  - { id: 5, class: fpr64 }
+  - { id: 6, class: fpr64 }
+  - { id: 7, class: fpr128 }
+  - { id: 8, class: fpr128 }
+  - { id: 9, class: fpr64 }
+  - { id: 10, class: fpr64 }
+liveins:
+  - { reg: '$d0', virtual-reg: '%1' }
+  - { reg: '$d1', virtual-reg: '%2' }
+  - { reg: '$d2', virtual-reg: '%3' }
+  - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  ; CHECK-LABEL: name: indexed_2s_rev
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $d0, $d1, $d2, $x0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:fpr64 = COPY $d2
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:fpr64 = COPY $d1
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:fpr64 = COPY $d0
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY3]], %subreg.dsub
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:fpr64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:fpr64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[DUPv2i32lane:%[0-9]+]]:fpr64 = DUPv2i32lane [[INSERT_SUBREG]], 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1.for.cond:
+  ; CHECK-NEXT:   [[FMULv2i32_indexed:%[0-9]+]]:fpr64 = FMULv2i32_indexed [[COPY5]], [[INSERT_SUBREG]], 0
+  ; CHECK-NEXT:   [[FADDv2f32_:%[0-9]+]]:fpr64 = FADDv2f32 killed [[FMULv2i32_indexed]], [[COPY4]]
+  ; CHECK-NEXT:   STRDui killed [[FADDv2f32_]], [[COPY]], 0 :: (store (s64) into %ir.ret, align 16)
+  ; CHECK-NEXT:   B %bb.1
+  bb.0.entry:
+    liveins: $d0, $d1, $d2, $x0
+
+    %4:gpr64common = COPY $x0
+    %3:fpr64 = COPY $d2
+    %2:fpr64 = COPY $d1
+    %1:fpr64 = COPY $d0
+    %8:fpr128 = IMPLICIT_DEF
+    %7:fpr128 = INSERT_SUBREG %8, %1, %subreg.dsub
+    %6:fpr64 = COPY %3
+    %5:fpr64 = COPY %2
+    %0:fpr64 = DUPv2i32lane killed %7, 0
+
+  bb.1.for.cond:
+    %9:fpr64 = FMULv2f32 %0, %5
+    %10:fpr64 = FADDv2f32 killed %9, %6
+    STRDui killed %10, %4, 0 :: (store 8 into %ir.ret, align 16)
+    B %bb.1
+
+...
+---
+name: indexed_2d
+alignment: 16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr128 }
+  - { id: 1, class: fpr128 }
+  - { id: 2, class: fpr128 }
+  - { id: 3, class: fpr128 }
+  - { id: 4, class: gpr64common }
+  - { id: 5, class: fpr128 }
+  - { id: 6, class: fpr128 }
+  - { id: 7, class: fpr128 }
+  - { id: 8, class: fpr128 }
+liveins:
+  - { reg: '$q0', virtual-reg: '%1' }
+  - { reg: '$q1', virtual-reg: '%2' }
+  - { reg: '$q2', virtual-reg: '%3' }
+  - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  ; CHECK-LABEL: name: indexed_2d
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $q0, $q1, $q2, $x0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:fpr128 = COPY $q2
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:fpr128 = COPY $q1
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:fpr128 = COPY $q0
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:fpr128 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:fpr128 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[COPY3]], 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1.for.cond:
+  ; CHECK-NEXT:   [[FMULv2i64_indexed:%[0-9]+]]:fpr128 = FMULv2i64_indexed [[COPY5]], [[COPY3]], 0
+  ; CHECK-NEXT:   [[FADDv2f64_:%[0-9]+]]:fpr128 = FADDv2f64 killed [[FMULv2i64_indexed]], [[COPY4]]
+  ; CHECK-NEXT:   STRQui killed [[FADDv2f64_]], [[COPY]], 0 :: (store (s128) into %ir.ret)
+  ; CHECK-NEXT:   B %bb.1
+  bb.0.entry:
+    liveins: $q0, $q1, $q2, $x0
+
+    %4:gpr64common = COPY $x0
+    %3:fpr128 = COPY $q2
+    %2:fpr128 = COPY $q1
+    %1:fpr128 = COPY $q0
+    %6:fpr128 = COPY %3
+    %5:fpr128 = COPY %2
+    %0:fpr128 = DUPv2i64lane %1, 0
+
+  bb.1.for.cond:
+    %7:fpr128 = FMULv2f64 %5, %0
+    %8:fpr128 = FADDv2f64 killed %7, %6
+    STRQui killed %8, %4, 0 :: (store 16 into %ir.ret)
+    B %bb.1
+
+...
+---
+name: indexed_4s
+alignment: 16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr128 }
+  - { id: 1, class: fpr128 }
+  - { id: 2, class: fpr128 }
+  - { id: 3, class: fpr128 }
+  - { id: 4, class: gpr64common }
+  - { id: 5, class: fpr128 }
+  - { id: 6, class: fpr128 }
+  - { id: 7, class: fpr128 }
+  - { id: 8, class: fpr128 }
+liveins:
+  - { reg: '$q0', virtual-reg: '%1' }
+  - { reg: '$q1', virtual-reg: '%2' }
+  - { reg: '$q2', virtual-reg: '%3' }
+  - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  ; CHECK-LABEL: name: indexed_4s
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $q0, $q1, $q2, $x0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:fpr128 = COPY $q2
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:fpr128 = COPY $q1
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:fpr128 = COPY $q0
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:fpr128 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:fpr128 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[DUPv4i32lane:%[0-9]+]]:fpr128 = DUPv4i32lane [[COPY3]], 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1.for.cond:
+  ; CHECK-NEXT:   [[FMULv4i32_indexed:%[0-9]+]]:fpr128 = FMULv4i32_indexed [[COPY5]], [[COPY3]], 0
+  ; CHECK-NEXT:   [[FADDv4f32_:%[0-9]+]]:fpr128 = FADDv4f32 killed [[FMULv4i32_indexed]], [[COPY4]]
+  ; CHECK-NEXT:   STRQui killed [[FADDv4f32_]], [[COPY]], 0 :: (store (s128) into %ir.ret)
+  ; CHECK-NEXT:   B %bb.1
+  bb.0.entry:
+    liveins: $q0, $q1, $q2, $x0
+
+    %4:gpr64common = COPY $x0
+    %3:fpr128 = COPY $q2
+    %2:fpr128 = COPY $q1
+    %1:fpr128 = COPY $q0
+    %6:fpr128 = COPY %3
+    %5:fpr128 = COPY %2
+    %0:fpr128 = DUPv4i32lane %1, 0
+
+  bb.1.for.cond:
+    %7:fpr128 = FMULv4f32 %5, %0
+    %8:fpr128 = FADDv4f32 killed %7, %6
+    STRQui killed %8, %4, 0 :: (store 16 into %ir.ret)
+    B %bb.1
+
+...
+---
+name: indexed_4h
+alignment: 16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr64 }
+  - { id: 1, class: fpr64 }
+  - { id: 2, class: fpr64 }
+  - { id: 3, class: fpr64 }
+  - { id: 4, class: gpr64common }
+  - { id: 5, class: fpr128 }
+  - { id: 6, class: fpr128 }
+  - { id: 7, class: fpr64 }
+  - { id: 8, class: fpr64 }
+liveins:
+  - { reg: '$d0', virtual-reg: '%1' }
+  - { reg: '$d1', virtual-reg: '%2' }
+  - { reg: '$d2', virtual-reg: '%3' }
+  - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  ; CHECK-LABEL: name: indexed_4h
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $d0, $d1, $d2, $x0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:fpr64 = COPY $d2
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:fpr64 = COPY $d1
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:fpr64 = COPY $d0
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:fpr128_lo = INSERT_SUBREG [[DEF]], [[COPY3]], %subreg.dsub
+  ; CHECK-NEXT:   [[DUPv4i16lane:%[0-9]+]]:fpr64 = DUPv4i16lane [[INSERT_SUBREG]], 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1.for.cond:
+  ; CHECK-NEXT:   [[FMULv4i16_indexed:%[0-9]+]]:fpr64 = FMULv4i16_indexed [[COPY2]], [[INSERT_SUBREG]], 0
+  ; CHECK-NEXT:   [[FADDv4f16_:%[0-9]+]]:fpr64 = FADDv4f16 killed [[FMULv4i16_indexed]], [[COPY1]]
+  ; CHECK-NEXT:   STRDui killed [[FADDv4f16_]], [[COPY]], 0 :: (store (s64) into %ir.ret, align 16)
+  ; CHECK-NEXT:   B %bb.1
+  bb.0.entry:
+    liveins: $d0, $d1, $d2, $x0
+
+    %4:gpr64common = COPY $x0
+    %3:fpr64 = COPY $d2
+    %2:fpr64 = COPY $d1
+    %1:fpr64 = COPY $d0
+    %6:fpr128 = IMPLICIT_DEF
+    %5:fpr128 = INSERT_SUBREG %6, %1, %subreg.dsub
+    %0:fpr64 = DUPv4i16lane killed %5, 0
+
+  bb.1.for.cond:
+    %7:fpr64 = FMULv4f16 %2, %0
+    %8:fpr64 = FADDv4f16 killed %7, %3
+    STRDui killed %8, %4, 0 :: (store 8 into %ir.ret, align 16)
+    B %bb.1
+
+...
+---
+name: indexed_8h
+alignment: 16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr128 }
+  - { id: 1, class: fpr128 }
+  - { id: 2, class: fpr128 }
+  - { id: 3, class: fpr128 }
+  - { id: 4, class: gpr64common }
+  - { id: 5, class: fpr128 }
+  - { id: 6, class: fpr128 }
+liveins:
+  - { reg: '$q0', virtual-reg: '%1' }
+  - { reg: '$q1', virtual-reg: '%2' }
+  - { reg: '$q2', virtual-reg: '%3' }
+  - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  ; CHECK-LABEL: name: indexed_8h
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $q0, $q1, $q2, $x0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:fpr128 = COPY $q2
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:fpr128 = COPY $q1
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:fpr128_lo = COPY $q0
+  ; CHECK-NEXT:   [[DUPv8i16lane:%[0-9]+]]:fpr128 = DUPv8i16lane [[COPY3]], 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1.for.cond:
+  ; CHECK-NEXT:   [[FMULv8i16_indexed:%[0-9]+]]:fpr128 = FMULv8i16_indexed [[COPY2]], [[COPY3]], 0
+  ; CHECK-NEXT:   [[FADDv8f16_:%[0-9]+]]:fpr128 = FADDv8f16 killed [[FMULv8i16_indexed]], [[COPY1]]
+  ; CHECK-NEXT:   STRQui killed [[FADDv8f16_]], [[COPY]], 0 :: (store (s128) into %ir.ret)
+  ; CHECK-NEXT:   B %bb.1
+  bb.0.entry:
+    liveins: $q0, $q1, $q2, $x0
+
+    %4:gpr64common = COPY $x0
+    %3:fpr128 = COPY $q2
+    %2:fpr128 = COPY $q1
+    %1:fpr128 = COPY $q0
+    %0:fpr128 = DUPv8i16lane %1, 0
+
+  bb.1.for.cond:
+    %5:fpr128 = FMULv8f16 %2, %0
+    %6:fpr128 = FADDv8f16 killed %5, %3
+    STRQui killed %6, %4, 0 :: (store 16 into %ir.ret)
+    B %bb.1
+
+...
+---
+name: kill_state
+alignment: 16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr64 }
+  - { id: 1, class: fpr64 }
+  - { id: 2, class: fpr64 }
+  - { id: 3, class: fpr64 }
+  - { id: 4, class: gpr64common }
+  - { id: 5, class: gpr64common }
+  - { id: 6, class: fpr32 }
+  - { id: 7, class: fpr64 }
+  - { id: 8, class: fpr64 }
+  - { id: 9, class: fpr128 }
+  - { id: 10, class: fpr128 }
+  - { id: 11, class: fpr128 }
+  - { id: 12, class: fpr128 }
+  - { id: 13, class: fpr128 }
+  - { id: 14, class: fpr64 }
+  - { id: 15, class: fpr64 }
+  - { id: 16, class: fpr64 }
+liveins:
+  - { reg: '$d0', virtual-reg: '%1' }
+  - { reg: '$d1', virtual-reg: '%2' }
+  - { reg: '$d2', virtual-reg: '%3' }
+  - { reg: '$x0', virtual-reg: '%4' }
+  - { reg: '$x1', virtual-reg: '%5' }
+  - { reg: '$s3', virtual-reg: '%6' }
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  ; CHECK-LABEL: name: kill_state
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $d0, $d1, $d2, $x0, $x1, $s3
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:fpr32 = COPY $s3
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:fpr64 = COPY $d2
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:fpr64 = COPY $d1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:fpr64 = COPY $d0
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY5]], %subreg.dsub
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:fpr64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:fpr64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[DUPv2i32lane:%[0-9]+]]:fpr64 = DUPv2i32lane [[INSERT_SUBREG]], 0
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY]], %subreg.ssub
+  ; CHECK-NEXT:   [[INSvi32lane:%[0-9]+]]:fpr128 = INSvi32lane [[INSERT_SUBREG]], 1, killed [[INSERT_SUBREG1]], 0
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:fpr64 = COPY [[INSvi32lane]].dsub
+  ; CHECK-NEXT:   STRDui killed [[COPY8]], [[COPY1]], 0 :: (store (s64) into %ir.ret2)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1.for.cond:
+  ; CHECK-NEXT:   [[FMULv2i32_indexed:%[0-9]+]]:fpr64 = FMULv2i32_indexed [[COPY7]], [[INSERT_SUBREG]], 0
+  ; CHECK-NEXT:   [[FADDv2f32_:%[0-9]+]]:fpr64 = FADDv2f32 killed [[FMULv2i32_indexed]], [[COPY6]]
+  ; CHECK-NEXT:   STRDui killed [[FADDv2f32_]], [[COPY2]], 0 :: (store (s64) into %ir.ret, align 16)
+  ; CHECK-NEXT:   B %bb.1
+  bb.0.entry:
+    liveins: $d0, $d1, $d2, $x0, $x1, $s3
+
+    %6:fpr32 = COPY $s3
+    %5:gpr64common = COPY $x1
+    %4:gpr64common = COPY $x0
+    %3:fpr64 = COPY $d2
+    %2:fpr64 = COPY $d1
+    %1:fpr64 = COPY $d0
+    %10:fpr128 = IMPLICIT_DEF
+    %9:fpr128 = INSERT_SUBREG %10, %1, %subreg.dsub
+    %8:fpr64 = COPY %3
+    %7:fpr64 = COPY %2
+    %0:fpr64 = DUPv2i32lane %9, 0
+    %12:fpr128 = IMPLICIT_DEF
+    %11:fpr128 = INSERT_SUBREG %12, %6, %subreg.ssub
+    %13:fpr128 = INSvi32lane killed %9, 1, killed %11, 0
+    %14:fpr64 = COPY %13.dsub
+    STRDui killed %14, %5, 0 :: (store (s64) into %ir.ret2)
+
+  bb.1.for.cond:
+    %15:fpr64 = FMULv2f32 %7, %0
+    %16:fpr64 = FADDv2f32 killed %15, %8
+    STRDui killed %16, %4, 0 :: (store (s64) into %ir.ret, align 16)
+    B %bb.1
+
+...
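Reviewer note: kill_state pins down the MRI.clearKillFlags call in genIndexedMultiply. Before the combine, the last use of the DUP source %9 in the entry block carries a kill flag; after the combine the loop body reads %9 directly, so that flag would be stale and has to be cleared. A sketch using the virtual register numbers from the test above:

  # Entry block before the combine: %9 dies at the INSvi32lane.
  %0:fpr64 = DUPv2i32lane %9, 0
  %13:fpr128 = INSvi32lane killed %9, 1, killed %11, 0
  # After the combine the kill flag on %9 is gone ...
  %13:fpr128 = INSvi32lane %9, 1, killed %11, 0
  # ... because the rewritten loop body now uses %9 again:
  %15:fpr64 = FMULv2i32_indexed %7, %9, 0

This matches the autogenerated CHECK lines, where the first operand of INSvi32lane loses its killed marker.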