Skip to content

Commit b8e30d6

Browse files
committed Nov 22, 2016
[PowerPC] Emit VMX loads/stores for aligned ops to avoid adding swaps on LE
This patch corresponds to review: https://reviews.llvm.org/D26861 It also fixes PR30730. Committing on behalf of Lei Huang. llvm-svn: 287679
1 parent d1aed9a commit b8e30d6

File tree

7 files changed

+106
-65
lines changed

7 files changed

+106
-65
lines changed
 

‎llvm/lib/Target/PowerPC/PPCISelLowering.cpp

+15
Original file line numberDiff line numberDiff line change
@@ -10814,6 +10814,14 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
1081410814
}
1081510815

1081610816
MVT VecTy = N->getValueType(0).getSimpleVT();
10817+
10818+
// Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
10819+
// aligned and the type is a vector with elements up to 4 bytes
10820+
if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16)
10821+
&& VecTy.getScalarSizeInBits() <= 32 ) {
10822+
return SDValue();
10823+
}
10824+
1081710825
SDValue LoadOps[] = { Chain, Base };
1081810826
SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
1081910827
DAG.getVTList(MVT::v2f64, MVT::Other),
@@ -10878,6 +10886,13 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
1087810886
SDValue Src = N->getOperand(SrcOpnd);
1087910887
MVT VecTy = Src.getValueType().getSimpleVT();
1088010888

10889+
// Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is
10890+
// aligned and the type is a vector with elements up to 4 bytes
10891+
if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16)
10892+
&& VecTy.getScalarSizeInBits() <= 32 ) {
10893+
return SDValue();
10894+
}
10895+
1088110896
// All stores are done as v2f64 and possible bit cast.
1088210897
if (VecTy != MVT::v2f64) {
1088310898
Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);

‎llvm/lib/Target/PowerPC/PPCInstrVSX.td

+8-6
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ let Uses = [RM] in {
138138
def LXVW4X : XX1Form<31, 780,
139139
(outs vsrc:$XT), (ins memrr:$src),
140140
"lxvw4x $XT, $src", IIC_LdStLFD,
141-
[(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
141+
[]>;
142142
} // mayLoad
143143

144144
// Store indexed instructions
@@ -160,7 +160,7 @@ let Uses = [RM] in {
160160
def STXVW4X : XX1Form<31, 908,
161161
(outs), (ins vsrc:$XT, memrr:$dst),
162162
"stxvw4x $XT, $dst", IIC_LdStSTFD,
163-
[(store v4i32:$XT, xoaddr:$dst)]>;
163+
[]>;
164164
}
165165
} // mayStore
166166

@@ -1018,8 +1018,6 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
10181018
// Stores.
10191019
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
10201020
(STXVD2X $rS, xoaddr:$dst)>;
1021-
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
1022-
(STXVW4X $rS, xoaddr:$dst)>;
10231021
def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
10241022
(STXVD2X $rS, xoaddr:$dst)>;
10251023
def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
@@ -1030,8 +1028,12 @@ let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
10301028
def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
10311029
def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
10321030
def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
1031+
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVW4X xoaddr:$src)>;
10331032
def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
10341033
def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
1034+
def : Pat<(store v4i32:$XT, xoaddr:$dst), (STXVW4X $XT, xoaddr:$dst)>;
1035+
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
1036+
(STXVW4X $rS, xoaddr:$dst)>;
10351037
}
10361038

10371039
// Permutes.
@@ -1852,8 +1854,8 @@ let Predicates = [IsLittleEndian, HasVSX] in
18521854
def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
18531855
(f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
18541856

1855-
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
1856-
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
1857+
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
1858+
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
18571859

18581860
let Predicates = [IsLittleEndian, HasDirectMove] in {
18591861
// v16i8 scalar <-> vector conversions (LE)

‎llvm/test/CodeGen/PowerPC/ppc64-i128-abi.ll

+10-10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
2-
; RUN: -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-LE
2+
; RUN: -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-LE \
3+
; RUN: --implicit-check-not xxswapd
34

45
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
56
; RUN: -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-BE
@@ -8,13 +9,15 @@
89
; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX
910

1011
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
11-
; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX
12+
; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX \
13+
; RUN: --implicit-check-not xxswapd
1214

1315
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
1416
; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-BE-NOVSX
1517

1618
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
17-
; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-LE-NOVSX
19+
; RUN: -mcpu=pwr8 -mattr=-vsx < %s | \
20+
; RUN: FileCheck %s -check-prefix=CHECK-LE-NOVSX --implicit-check-not xxswapd
1821

1922
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
2023
; RUN: -mcpu=pwr9 -ppc-vsr-nums-as-vr < %s | FileCheck %s \
@@ -26,7 +29,7 @@
2629

2730
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
2831
; RUN: -mcpu=pwr9 -mattr=-power9-vector -mattr=-direct-move < %s | \
29-
; RUN: FileCheck %s -check-prefix=CHECK-LE
32+
; RUN: FileCheck %s -check-prefix=CHECK-LE --implicit-check-not xxswapd
3033

3134
@x = common global <1 x i128> zeroinitializer, align 16
3235
@y = common global <1 x i128> zeroinitializer, align 16
@@ -199,8 +202,7 @@ define <1 x i128> @call_v1i128_increment_by_one() nounwind {
199202
ret <1 x i128> %ret
200203

201204
; CHECK-LE-LABEL: @call_v1i128_increment_by_one
202-
; CHECK-LE: lxvd2x [[PARAM:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
203-
; CHECK-LE: xxswapd 34, [[PARAM]]
205+
; CHECK-LE: lvx 2, {{[0-9]+}}, {{[0-9]+}}
204206
; CHECK-LE: bl v1i128_increment_by_one
205207
; CHECK-LE: blr
206208

@@ -229,10 +231,8 @@ define <1 x i128> @call_v1i128_increment_by_val() nounwind {
229231
ret <1 x i128> %ret
230232

231233
; CHECK-LE-LABEL: @call_v1i128_increment_by_val
232-
; CHECK-LE: lxvd2x [[PARAM1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
233-
; CHECK-LE: lxvd2x [[PARAM2:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
234-
; CHECK-LE-DAG: xxswapd 34, [[PARAM1]]
235-
; CHECK-LE-DAG: xxswapd 35, [[PARAM2]]
234+
; CHECK-LE: lvx 2, {{[0-9]+}}, {{[0-9]+}}
235+
; CHECK-LE: lvx 3, {{[0-9]+}}, {{[0-9]+}}
236236
; CHECK-LE: bl v1i128_increment_by_val
237237
; CHECK-LE: blr
238238

‎llvm/test/CodeGen/PowerPC/swaps-le-1.ll

+26-20
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@
1313
; RUN: -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s \
1414
; RUN: | FileCheck -check-prefix=NOOPTSWAP %s
1515

16+
; LH: 2016-11-17
17+
; Updated align attribute from 16 to 8 to keep the swap instruction tests.
18+
; Changes have been made on little-endian to use lvx and stvx
19+
; instructions instead of lxvd2x/xxswapd and xxswapd/stxvd2x for
20+
; aligned vectors with elements up to 4 bytes
21+
1622
; This test was generated from the following source:
1723
;
1824
; #define N 4096
@@ -29,10 +35,10 @@
2935
; }
3036
; }
3137

32-
@cb = common global [4096 x i32] zeroinitializer, align 16
33-
@cc = common global [4096 x i32] zeroinitializer, align 16
34-
@cd = common global [4096 x i32] zeroinitializer, align 16
35-
@ca = common global [4096 x i32] zeroinitializer, align 16
38+
@cb = common global [4096 x i32] zeroinitializer, align 8
39+
@cc = common global [4096 x i32] zeroinitializer, align 8
40+
@cd = common global [4096 x i32] zeroinitializer, align 8
41+
@ca = common global [4096 x i32] zeroinitializer, align 8
3642

3743
define void @foo() {
3844
entry:
@@ -42,63 +48,63 @@ vector.body:
4248
%index = phi i64 [ 0, %entry ], [ %index.next.3, %vector.body ]
4349
%0 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index
4450
%1 = bitcast i32* %0 to <4 x i32>*
45-
%wide.load = load <4 x i32>, <4 x i32>* %1, align 16
51+
%wide.load = load <4 x i32>, <4 x i32>* %1, align 8
4652
%2 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index
4753
%3 = bitcast i32* %2 to <4 x i32>*
48-
%wide.load13 = load <4 x i32>, <4 x i32>* %3, align 16
54+
%wide.load13 = load <4 x i32>, <4 x i32>* %3, align 8
4955
%4 = add nsw <4 x i32> %wide.load13, %wide.load
5056
%5 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index
5157
%6 = bitcast i32* %5 to <4 x i32>*
52-
%wide.load14 = load <4 x i32>, <4 x i32>* %6, align 16
58+
%wide.load14 = load <4 x i32>, <4 x i32>* %6, align 8
5359
%7 = mul nsw <4 x i32> %4, %wide.load14
5460
%8 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index
5561
%9 = bitcast i32* %8 to <4 x i32>*
56-
store <4 x i32> %7, <4 x i32>* %9, align 16
62+
store <4 x i32> %7, <4 x i32>* %9, align 8
5763
%index.next = add nuw nsw i64 %index, 4
5864
%10 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next
5965
%11 = bitcast i32* %10 to <4 x i32>*
60-
%wide.load.1 = load <4 x i32>, <4 x i32>* %11, align 16
66+
%wide.load.1 = load <4 x i32>, <4 x i32>* %11, align 8
6167
%12 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next
6268
%13 = bitcast i32* %12 to <4 x i32>*
63-
%wide.load13.1 = load <4 x i32>, <4 x i32>* %13, align 16
69+
%wide.load13.1 = load <4 x i32>, <4 x i32>* %13, align 8
6470
%14 = add nsw <4 x i32> %wide.load13.1, %wide.load.1
6571
%15 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next
6672
%16 = bitcast i32* %15 to <4 x i32>*
67-
%wide.load14.1 = load <4 x i32>, <4 x i32>* %16, align 16
73+
%wide.load14.1 = load <4 x i32>, <4 x i32>* %16, align 8
6874
%17 = mul nsw <4 x i32> %14, %wide.load14.1
6975
%18 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next
7076
%19 = bitcast i32* %18 to <4 x i32>*
71-
store <4 x i32> %17, <4 x i32>* %19, align 16
77+
store <4 x i32> %17, <4 x i32>* %19, align 8
7278
%index.next.1 = add nuw nsw i64 %index.next, 4
7379
%20 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.1
7480
%21 = bitcast i32* %20 to <4 x i32>*
75-
%wide.load.2 = load <4 x i32>, <4 x i32>* %21, align 16
81+
%wide.load.2 = load <4 x i32>, <4 x i32>* %21, align 8
7682
%22 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.1
7783
%23 = bitcast i32* %22 to <4 x i32>*
78-
%wide.load13.2 = load <4 x i32>, <4 x i32>* %23, align 16
84+
%wide.load13.2 = load <4 x i32>, <4 x i32>* %23, align 8
7985
%24 = add nsw <4 x i32> %wide.load13.2, %wide.load.2
8086
%25 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.1
8187
%26 = bitcast i32* %25 to <4 x i32>*
82-
%wide.load14.2 = load <4 x i32>, <4 x i32>* %26, align 16
88+
%wide.load14.2 = load <4 x i32>, <4 x i32>* %26, align 8
8389
%27 = mul nsw <4 x i32> %24, %wide.load14.2
8490
%28 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.1
8591
%29 = bitcast i32* %28 to <4 x i32>*
86-
store <4 x i32> %27, <4 x i32>* %29, align 16
92+
store <4 x i32> %27, <4 x i32>* %29, align 8
8793
%index.next.2 = add nuw nsw i64 %index.next.1, 4
8894
%30 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.2
8995
%31 = bitcast i32* %30 to <4 x i32>*
90-
%wide.load.3 = load <4 x i32>, <4 x i32>* %31, align 16
96+
%wide.load.3 = load <4 x i32>, <4 x i32>* %31, align 8
9197
%32 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.2
9298
%33 = bitcast i32* %32 to <4 x i32>*
93-
%wide.load13.3 = load <4 x i32>, <4 x i32>* %33, align 16
99+
%wide.load13.3 = load <4 x i32>, <4 x i32>* %33, align 8
94100
%34 = add nsw <4 x i32> %wide.load13.3, %wide.load.3
95101
%35 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.2
96102
%36 = bitcast i32* %35 to <4 x i32>*
97-
%wide.load14.3 = load <4 x i32>, <4 x i32>* %36, align 16
103+
%wide.load14.3 = load <4 x i32>, <4 x i32>* %36, align 8
98104
%37 = mul nsw <4 x i32> %34, %wide.load14.3
99105
%38 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.2
100106
%39 = bitcast i32* %38 to <4 x i32>*
101-
store <4 x i32> %37, <4 x i32>* %39, align 16
107+
store <4 x i32> %37, <4 x i32>* %39, align 8
102108
%index.next.3 = add nuw nsw i64 %index.next.2, 4
103109
%40 = icmp eq i64 %index.next.3, 4096
104110
br i1 %40, label %for.end, label %vector.body

‎llvm/test/CodeGen/PowerPC/swaps-le-2.ll

+19-12
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,13 @@
22

33
; Test swap removal when a vector splat must be adjusted to make it legal.
44
;
5+
6+
; LH: 2016-11-17
7+
; Updated align attribute from 16 to 8 to keep the swap instruction tests.
8+
; Changes have been made on little-endian to use lvx and stvx
9+
; instructions instead of lxvd2x/xxswapd and xxswapd/stxvd2x for
10+
; aligned vectors with elements up to 4 bytes
11+
512
; Test generated from following C code:
613
;
714
; vector char vc = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
@@ -28,37 +35,37 @@
2835
; vir = (vector int){vi[1], vi[1], vi[1], vi[1]};
2936
; }
3037

31-
@vc = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
32-
@vs = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
33-
@vi = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
34-
@vcr = common global <16 x i8> zeroinitializer, align 16
35-
@vsr = common global <8 x i16> zeroinitializer, align 16
36-
@vir = common global <4 x i32> zeroinitializer, align 16
38+
@vc = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 8
39+
@vs = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 8
40+
@vi = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 8
41+
@vcr = common global <16 x i8> zeroinitializer, align 8
42+
@vsr = common global <8 x i16> zeroinitializer, align 8
43+
@vir = common global <4 x i32> zeroinitializer, align 8
3744

3845
; Function Attrs: nounwind
3946
define void @cfoo() {
4047
entry:
41-
%0 = load <16 x i8>, <16 x i8>* @vc, align 16
48+
%0 = load <16 x i8>, <16 x i8>* @vc, align 8
4249
%vecinit30 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
43-
store <16 x i8> %vecinit30, <16 x i8>* @vcr, align 16
50+
store <16 x i8> %vecinit30, <16 x i8>* @vcr, align 8
4451
ret void
4552
}
4653

4754
; Function Attrs: nounwind
4855
define void @sfoo() {
4956
entry:
50-
%0 = load <8 x i16>, <8 x i16>* @vs, align 16
57+
%0 = load <8 x i16>, <8 x i16>* @vs, align 8
5158
%vecinit14 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
52-
store <8 x i16> %vecinit14, <8 x i16>* @vsr, align 16
59+
store <8 x i16> %vecinit14, <8 x i16>* @vsr, align 8
5360
ret void
5461
}
5562

5663
; Function Attrs: nounwind
5764
define void @ifoo() {
5865
entry:
59-
%0 = load <4 x i32>, <4 x i32>* @vi, align 16
66+
%0 = load <4 x i32>, <4 x i32>* @vi, align 8
6067
%vecinit6 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
61-
store <4 x i32> %vecinit6, <4 x i32>* @vir, align 16
68+
store <4 x i32> %vecinit6, <4 x i32>* @vir, align 8
6269
ret void
6370
}
6471

‎llvm/test/CodeGen/PowerPC/vsx-ldst.ll

+4-2
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@
1414

1515
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mattr=+vsx -O2 \
1616
; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
17-
; RUN: grep lxvd2x < %t | count 6
18-
; RUN: grep stxvd2x < %t | count 6
17+
; RUN: grep lxvd2x < %t | count 3
18+
; RUN: grep lvx < %t | count 3
19+
; RUN: grep stxvd2x < %t | count 3
20+
; RUN: grep stvx < %t | count 3
1921

2022
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -O2 \
2123
; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s > %t

‎llvm/test/CodeGen/PowerPC/vsx.ll

+24-15
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
1-
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx < %s | FileCheck %s
2-
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-REG %s
3-
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx -fast-isel -O0 < %s | FileCheck %s
4-
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx -fast-isel -O0 < %s | FileCheck -check-prefix=CHECK-FISL %s
5-
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-LE %s
1+
; RUN: llc -verify-machineinstrs -mcpu=pwr7 \
2+
; RUN: -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx < %s | FileCheck %s
3+
; RUN: llc -verify-machineinstrs -mcpu=pwr7 \
4+
; RUN: -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx < %s | \
5+
; RUN: FileCheck -check-prefix=CHECK-REG %s
6+
; RUN: llc -verify-machineinstrs -mcpu=pwr7 \
7+
; RUN: -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx -fast-isel -O0 < %s |\
8+
; RUN: FileCheck %s
9+
; RUN: llc -verify-machineinstrs -mcpu=pwr7 \
10+
; RUN: -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx -fast-isel -O0 < %s |\
11+
; RUN: FileCheck -check-prefix=CHECK-FISL %s
12+
; RUN: llc -verify-machineinstrs -mcpu=pwr8 \
13+
; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mattr=+vsx < %s | \
14+
; RUN: FileCheck -check-prefix=CHECK-LE %s
615

716
define double @test1(double %a, double %b) {
817
entry:
@@ -645,8 +654,8 @@ define <4 x float> @test32(<4 x float>* %a) {
645654
; CHECK-FISL: blr
646655

647656
; CHECK-LE-LABEL: @test32
648-
; CHECK-LE: lxvd2x [[V1:[0-9]+]], 0, 3
649-
; CHECK-LE: xxswapd 34, [[V1]]
657+
; CHECK-LE: lvx 2, 0, 3
658+
; CHECK-LE-NOT: xxswapd
650659
; CHECK-LE: blr
651660
}
652661

@@ -663,8 +672,8 @@ define void @test33(<4 x float>* %a, <4 x float> %b) {
663672
; CHECK-FISL: blr
664673

665674
; CHECK-LE-LABEL: @test33
666-
; CHECK-LE: xxswapd [[V1:[0-9]+]], 34
667-
; CHECK-LE: stxvd2x [[V1]], 0, 3
675+
; CHECK-LE-NOT: xxswapd
676+
; CHECK-LE: stvx 2, 0, 3
668677
; CHECK-LE: blr
669678
}
670679

@@ -716,8 +725,8 @@ define <4 x i32> @test34(<4 x i32>* %a) {
716725
; CHECK-FISL: blr
717726

718727
; CHECK-LE-LABEL: @test34
719-
; CHECK-LE: lxvd2x [[V1:[0-9]+]], 0, 3
720-
; CHECK-LE: xxswapd 34, [[V1]]
728+
; CHECK-LE: lvx 2, 0, 3
729+
; CHECK-LE-NOT: xxswapd
721730
; CHECK-LE: blr
722731
}
723732

@@ -734,8 +743,8 @@ define void @test35(<4 x i32>* %a, <4 x i32> %b) {
734743
; CHECK-FISL: blr
735744

736745
; CHECK-LE-LABEL: @test35
737-
; CHECK-LE: xxswapd [[V1:[0-9]+]], 34
738-
; CHECK-LE: stxvd2x [[V1]], 0, 3
746+
; CHECK-LE-NOT: xxswapd
747+
; CHECK-LE: stvx 2, 0, 3
739748
; CHECK-LE: blr
740749
}
741750

@@ -1154,9 +1163,9 @@ define <2 x i32> @test80(i32 %v) {
11541163
; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3
11551164
; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]]
11561165
; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI
1157-
; CHECK-LE-DAG: lxvd2x [[V2:[0-9]+]], 0, [[R2]]
1166+
; CHECK-LE-DAG: lvx 3, 0, [[R2]]
11581167
; CHECK-LE-DAG: xxspltw 34, [[V1]]
1159-
; CHECK-LE-DAG: xxswapd 35, [[V2]]
1168+
; CHECK-LE-NOT: xxswapd 35, [[V2]]
11601169
; CHECK-LE: vadduwm 2, 2, 3
11611170
; CHECK-LE: blr
11621171
}

0 commit comments

Comments
 (0)
Please sign in to comment.