diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -2563,6 +2563,11 @@ (STXVRDX $src, xoaddr:$dst)>; def : Pat<(store (f64 (extractelt v2f64:$src, 0)), xoaddr:$dst), (STXVRDX $src, xoaddr:$dst)>; + // Load element 0 of a VSX register from memory + def : Pat<(v8i16 (scalar_to_vector (i32 (extloadi16 xoaddr:$src)))), + (v8i16 (COPY_TO_REGCLASS (LXVRHX xoaddr:$src), VSRC))>; + def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 xoaddr:$src)))), + (v16i8 (COPY_TO_REGCLASS (LXVRBX xoaddr:$src), VSRC))>; } // FIXME: The swap is overkill when the shift amount is a constant. diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -152,6 +152,7 @@ def NoP9Vector : Predicate<"!Subtarget->hasP9Vector()">; def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">; def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">; +def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">; //--------------------- VSX-specific instruction formats ---------------------// // By default, all VSX instructions are to be selected over their Altivec @@ -2437,6 +2438,8 @@ // [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian] // [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian] // [HasVSX, HasP9Vector] +// [HasVSX, HasP9Vector, NoP10Vector] +// [HasVSX, HasP9Vector, IsBigEndian] // [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] // [HasVSX, HasP9Vector, IsLittleEndian] // [HasVSX, HasP9Altivec] @@ -3735,9 +3738,6 @@ (STXVX $rS, xoaddr:$dst)>; // Build vectors from i8 loads -defm : ScalToVecWPermute; defm : ScalToVecWPermute; @@ -3755,9 +3755,6 @@ (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0)>; // Build vectors from i16 loads -defm : ScalToVecWPermute; defm : ScalToVecWPermute; @@ -3955,6 +3952,38 @@ (v4i32 (LXVWSX 
xoaddr:$A))>; } // HasVSX, HasP9Vector +// Any Power9 VSX subtarget with equivalent length but better Power10 VSX +// patterns. +// Two identical blocks are required due to the slightly different predicates: +// One without P10 instructions, the other is BigEndian only with P10 instructions. +let Predicates = [HasVSX, HasP9Vector, NoP10Vector] in { +// Little endian Power10 subtargets produce a shorter pattern but require a +// COPY_TO_REGCLASS. The COPY_TO_REGCLASS makes it appear to need two instructions +// to perform the operation, when only one instruction is produced in practice. +// The NoP10Vector predicate excludes these patterns from Power10 VSX subtargets. +defm : ScalToVecWPermute; +// Build vectors from i16 loads +defm : ScalToVecWPermute; +} // HasVSX, HasP9Vector, NoP10Vector + +// Any big endian Power9 VSX subtarget +let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in { +// Power10 VSX subtargets produce a shorter pattern for little endian targets +// but this is still the best pattern for Power9 and Power10 VSX big endian. +// Build vectors from i8 loads +defm : ScalToVecWPermute; +// Build vectors from i16 loads +defm : ScalToVecWPermute; +} // HasVSX, HasP9Vector, IsBigEndian + // Big endian 64Bit Power9 subtarget. 
let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in { def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))), diff --git a/llvm/test/CodeGen/PowerPC/load-rightmost-vector-elt.ll b/llvm/test/CodeGen/PowerPC/load-rightmost-vector-elt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/load-rightmost-vector-elt.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: < %s | FileCheck %s --check-prefix=CHECK-P10LE + +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: < %s | FileCheck %s --check-prefix=CHECK-P10BE + +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: < %s | FileCheck %s --check-prefix=CHECK-P9 + +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: < %s | FileCheck %s --check-prefix=CHECK-P9 + +define <8 x i16> @test1(i16* %a) { +; CHECK-P10LE-LABEL: test1: +; CHECK-P10LE: # %bb.0: # %entry +; CHECK-P10LE-NEXT: lxvrhx v2, 0, r3 +; CHECK-P10LE-NEXT: blr +; +; CHECK-P10BE-LABEL: test1: +; CHECK-P10BE: # %bb.0: # %entry +; CHECK-P10BE-NEXT: lxsihzx v2, 0, r3 +; CHECK-P10BE-NEXT: vsplth v2, v2, 3 +; CHECK-P10BE-NEXT: blr +; +; CHECK-P9-LABEL: test1: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-P9-NEXT: vsplth v2, v2, 3 +; CHECK-P9-NEXT: blr +entry: + %0 = load i16, i16* %a, align 2 + %vecinit = insertelement <8 x i16> undef, i16 %0, i32 0 + ret <8 x i16> %vecinit +} + +define <16 x i8> @test2(i8* %a) { +; CHECK-P10LE-LABEL: test2: +; CHECK-P10LE: # %bb.0: # %entry +; CHECK-P10LE-NEXT: lxvrbx v2, 0, r3 +; 
CHECK-P10LE-NEXT: blr +; +; CHECK-P10BE-LABEL: test2: +; CHECK-P10BE: # %bb.0: # %entry +; CHECK-P10BE-NEXT: lxsibzx v2, 0, r3 +; CHECK-P10BE-NEXT: vspltb v2, v2, 7 +; CHECK-P10BE-NEXT: blr +; +; CHECK-P9-LABEL: test2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-P9-NEXT: vspltb v2, v2, 7 +; CHECK-P9-NEXT: blr +entry: + %0 = load i8, i8* %a, align 1 + %vecins = insertelement <16 x i8> undef, i8 %0, i32 0 + ret <16 x i8> %vecins +} +