diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -129,7 +129,7 @@ static cl::opt<bool> DisablePerfectShuffle("ppc-disable-perfect-shuffle", cl::desc("disable vector permute decomposition"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); diff --git a/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll b/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll --- a/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll +++ b/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll @@ -1,5 +1,7 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | grep vsldoi -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | not grep vor +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | grep vsldoi +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | not grep vor + +; TODO: Fix this case when disabling perfect shuffle define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) { %tmp76 = shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < i32 0, i32 1, i32 2, i32 7 > ; <<4 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll b/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll --- a/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll +++ b/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll @@ -1447,16 +1447,16 @@ define <4 x float> @testSameVecEl0LE(<4 x float> %a) { ; CHECK-64-LABEL: testSameVecEl0LE: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: xxspltw 0, 34, 2 -; CHECK-64-NEXT: xxsldwi 0, 34, 0, 1 -; CHECK-64-NEXT: xxsldwi 34, 0, 0, 3 +; CHECK-64-NEXT: ld 3, L..C0(2) # %const.0 +; CHECK-64-NEXT: lxv 35, 0(3) +; CHECK-64-NEXT: vperm 
2, 2, 2, 3 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testSameVecEl0LE: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: xxspltw 0, 34, 2 -; CHECK-32-NEXT: xxsldwi 0, 34, 0, 1 -; CHECK-32-NEXT: xxsldwi 34, 0, 0, 3 +; CHECK-32-NEXT: lwz 3, L..C0(2) # %const.0 +; CHECK-32-NEXT: lxv 35, 0(3) +; CHECK-32-NEXT: vperm 2, 2, 2, 3 ; CHECK-32-NEXT: blr entry: %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> @@ -1465,16 +1465,16 @@ define <4 x float> @testSameVecEl1LE(<4 x float> %a) { ; CHECK-64-LABEL: testSameVecEl1LE: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: xxswapd 0, 34 -; CHECK-64-NEXT: xxmrghw 1, 34, 0 -; CHECK-64-NEXT: xxmrghw 34, 1, 0 +; CHECK-64-NEXT: ld 3, L..C1(2) # %const.0 +; CHECK-64-NEXT: lxv 35, 0(3) +; CHECK-64-NEXT: vperm 2, 2, 2, 3 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testSameVecEl1LE: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: xxswapd 0, 34 -; CHECK-32-NEXT: xxmrghw 1, 34, 0 -; CHECK-32-NEXT: xxmrghw 34, 1, 0 +; CHECK-32-NEXT: lwz 3, L..C1(2) # %const.0 +; CHECK-32-NEXT: lxv 35, 0(3) +; CHECK-32-NEXT: vperm 2, 2, 2, 3 ; CHECK-32-NEXT: blr entry: %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> @@ -1483,16 +1483,16 @@ define <4 x float> @testSameVecEl3LE(<4 x float> %a) { ; CHECK-64-LABEL: testSameVecEl3LE: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: xxspltw 0, 34, 2 -; CHECK-64-NEXT: xxswapd 1, 34 -; CHECK-64-NEXT: xxsldwi 34, 1, 0, 2 +; CHECK-64-NEXT: ld 3, L..C2(2) # %const.0 +; CHECK-64-NEXT: lxv 35, 0(3) +; CHECK-64-NEXT: vperm 2, 2, 2, 3 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testSameVecEl3LE: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: xxspltw 0, 34, 2 -; CHECK-32-NEXT: xxswapd 1, 34 -; CHECK-32-NEXT: xxsldwi 34, 1, 0, 2 +; CHECK-32-NEXT: lwz 3, L..C2(2) # %const.0 +; CHECK-32-NEXT: lxv 35, 0(3) +; CHECK-32-NEXT: vperm 2, 2, 2, 3 ; CHECK-32-NEXT: blr entry: %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> diff --git a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll 
b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll --- a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll +++ b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll @@ -30,12 +30,13 @@ ; CHECK-AIX-NEXT: lxvw4x 35, 0, 3 ; CHECK-AIX-NEXT: addi 3, 1, -16 ; CHECK-AIX-NEXT: lxvw4x 36, 0, 3 +; CHECK-AIX-NEXT: ld 3, L..C0(2) # %const.0 ; CHECK-AIX-NEXT: vmrghh 3, 2, 3 -; CHECK-AIX-NEXT: vsplth 5, 2, 0 -; CHECK-AIX-NEXT: vmrghh 2, 4, 2 -; CHECK-AIX-NEXT: xxmrghw 35, 35, 37 -; CHECK-AIX-NEXT: xxswapd 0, 35 -; CHECK-AIX-NEXT: xxsldwi 34, 0, 34, 2 +; CHECK-AIX-NEXT: vmrghh 4, 4, 2 +; CHECK-AIX-NEXT: vsplth 2, 2, 0 +; CHECK-AIX-NEXT: xxmrghw 34, 35, 34 +; CHECK-AIX-NEXT: lxvw4x 35, 0, 3 +; CHECK-AIX-NEXT: vperm 2, 2, 4, 3 ; CHECK-AIX-NEXT: vsplth 3, 2, 1 ; CHECK-AIX-NEXT: vsplth 2, 2, 4 ; CHECK-AIX-NEXT: stxvw4x 35, 0, 5 diff --git a/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll --- a/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll +++ b/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll @@ -55,15 +55,15 @@ define <2 x i64> @buildl(i64 %a) { ; CHECK-LABEL: buildl: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwz 5, L..C0(2) # %const.0 ; CHECK-NEXT: stw 4, -16(1) ; CHECK-NEXT: stw 3, -32(1) ; CHECK-NEXT: addi 3, 1, -16 ; CHECK-NEXT: addi 4, 1, -32 -; CHECK-NEXT: lxvw4x 0, 0, 3 -; CHECK-NEXT: lxvw4x 1, 0, 4 -; CHECK-NEXT: xxmrghw 34, 1, 0 -; CHECK-NEXT: xxswapd 0, 34 -; CHECK-NEXT: xxsldwi 34, 0, 34, 2 +; CHECK-NEXT: lxvw4x 35, 0, 3 +; CHECK-NEXT: lxvw4x 36, 0, 4 +; CHECK-NEXT: lxvw4x 34, 0, 5 +; CHECK-NEXT: vperm 2, 4, 3, 2 ; CHECK-NEXT: blr entry: %splat.splatinsert = insertelement <2 x i64> undef, i64 %a, i32 0 @@ -90,7 +90,7 @@ define <2 x double> @buildd() { ; CHECK-LABEL: buildd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lwz 3, L..C0(2) # @d +; CHECK-NEXT: lwz 3, L..C1(2) # @d ; CHECK-NEXT: lxvdsx 34, 0, 3 ; CHECK-NEXT: blr entry: diff --git 
a/llvm/test/CodeGen/PowerPC/extract-and-store.ll b/llvm/test/CodeGen/PowerPC/extract-and-store.ll --- a/llvm/test/CodeGen/PowerPC/extract-and-store.ll +++ b/llvm/test/CodeGen/PowerPC/extract-and-store.ll @@ -584,14 +584,16 @@ ; ; CHECK-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxspltw vs0, vs34, 0 -; CHECK-BE-NEXT: xxsldwi vs1, vs34, vs34, 1 -; CHECK-BE-NEXT: li r3, 16 +; CHECK-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha +; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-BE-NEXT: li r4, 20 +; CHECK-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l +; CHECK-BE-NEXT: lxvw4x vs35, 0, r3 +; CHECK-BE-NEXT: li r3, 16 ; CHECK-BE-NEXT: stxsiwx vs34, r5, r3 -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs0, 2 -; CHECK-BE-NEXT: stfiwx f1, r5, r4 -; CHECK-BE-NEXT: stxvw4x vs0, 0, r5 +; CHECK-BE-NEXT: stfiwx f0, r5, r4 +; CHECK-BE-NEXT: vperm v3, v2, v2, v3 +; CHECK-BE-NEXT: stxvw4x vs35, 0, r5 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_stores_exceed_vec_size: @@ -610,14 +612,16 @@ ; ; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxspltw vs0, vs34, 0 +; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha +; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l +; CHECK-P9-BE-NEXT: lxv vs35, 0(r3) ; CHECK-P9-BE-NEXT: li r3, 16 ; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3 ; CHECK-P9-BE-NEXT: li r3, 20 -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2 -; CHECK-P9-BE-NEXT: stxv vs0, 0(r5) -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-NEXT: vperm v3, v2, v2, v3 +; CHECK-P9-BE-NEXT: stxv vs35, 0(r5) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 2 diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -211,45 +211,45 @@ ; P9-AIX32-NEXT: lwz 
r5, 24(r4) ; P9-AIX32-NEXT: lwz r4, 28(r4) ; P9-AIX32-NEXT: stw r4, -16(r1) +; P9-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0 ; P9-AIX32-NEXT: stw r5, -32(r1) -; P9-AIX32-NEXT: lxv vs0, -16(r1) -; P9-AIX32-NEXT: lxv vs1, -32(r1) -; P9-AIX32-NEXT: xxmrghw v2, vs1, vs0 -; P9-AIX32-NEXT: xxswapd vs0, v2 -; P9-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 -; P9-AIX32-NEXT: stxv vs0, 0(r3) +; P9-AIX32-NEXT: lxv v3, -16(r1) +; P9-AIX32-NEXT: lxv v4, -32(r1) +; P9-AIX32-NEXT: lxv v2, 0(r4) +; P9-AIX32-NEXT: vperm v2, v4, v3, v2 +; P9-AIX32-NEXT: stxv v2, 0(r3) ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: test4: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r5, 24(r4) -; P8-AIX32-NEXT: lwz r4, 28(r4) -; P8-AIX32-NEXT: stw r4, -16(r1) -; P8-AIX32-NEXT: stw r5, -32(r1) +; P8-AIX32-NEXT: lwz r5, L..C0(r2) # %const.0 +; P8-AIX32-NEXT: lwz r6, 28(r4) +; P8-AIX32-NEXT: lwz r4, 24(r4) +; P8-AIX32-NEXT: stw r6, -16(r1) +; P8-AIX32-NEXT: stw r4, -32(r1) ; P8-AIX32-NEXT: addi r4, r1, -16 +; P8-AIX32-NEXT: lxvw4x v2, 0, r5 ; P8-AIX32-NEXT: addi r5, r1, -32 -; P8-AIX32-NEXT: lxvw4x vs0, 0, r4 -; P8-AIX32-NEXT: lxvw4x vs1, 0, r5 -; P8-AIX32-NEXT: xxmrghw v2, vs1, vs0 -; P8-AIX32-NEXT: xxswapd vs0, v2 -; P8-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 -; P8-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: lxvw4x v3, 0, r4 +; P8-AIX32-NEXT: lxvw4x v4, 0, r5 +; P8-AIX32-NEXT: vperm v2, v4, v3, v2 +; P8-AIX32-NEXT: stxvw4x v2, 0, r3 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: test4: ; P7-AIX32: # %bb.0: # %entry -; P7-AIX32-NEXT: lwz r6, 28(r4) -; P7-AIX32-NEXT: lwz r4, 24(r4) -; P7-AIX32-NEXT: addi r5, r1, -16 -; P7-AIX32-NEXT: stw r6, -16(r1) -; P7-AIX32-NEXT: stw r4, -32(r1) -; P7-AIX32-NEXT: addi r4, r1, -32 -; P7-AIX32-NEXT: lxvw4x vs0, 0, r5 -; P7-AIX32-NEXT: lxvw4x vs1, 0, r4 -; P7-AIX32-NEXT: xxmrghw v2, vs1, vs0 -; P7-AIX32-NEXT: xxswapd vs0, v2 -; P7-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 -; P7-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: lwz r5, L..C0(r2) # %const.0 +; P7-AIX32-NEXT: lwz r6, 24(r4) 
+; P7-AIX32-NEXT: lwz r4, 28(r4) +; P7-AIX32-NEXT: stw r4, -16(r1) +; P7-AIX32-NEXT: stw r6, -32(r1) +; P7-AIX32-NEXT: lxvw4x v2, 0, r5 +; P7-AIX32-NEXT: addi r4, r1, -16 +; P7-AIX32-NEXT: addi r5, r1, -32 +; P7-AIX32-NEXT: lxvw4x v3, 0, r4 +; P7-AIX32-NEXT: lxvw4x v4, 0, r5 +; P7-AIX32-NEXT: vperm v2, v4, v3, v2 +; P7-AIX32-NEXT: stxvw4x v2, 0, r3 ; P7-AIX32-NEXT: blr entry: %arrayidx = getelementptr inbounds i64, i64* %a, i64 3 @@ -288,45 +288,45 @@ ; P9-AIX32-NEXT: lwz r4, 0(r4) ; P9-AIX32-NEXT: srawi r5, r4, 31 ; P9-AIX32-NEXT: stw r4, -16(r1) -; P9-AIX32-NEXT: lxv vs0, -16(r1) +; P9-AIX32-NEXT: lwz r4, L..C1(r2) # %const.0 +; P9-AIX32-NEXT: lxv v3, -16(r1) ; P9-AIX32-NEXT: stw r5, -32(r1) -; P9-AIX32-NEXT: lxv vs1, -32(r1) -; P9-AIX32-NEXT: xxmrghw v2, vs1, vs0 -; P9-AIX32-NEXT: xxswapd vs0, v2 -; P9-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 -; P9-AIX32-NEXT: stxv vs0, 0(r3) +; P9-AIX32-NEXT: lxv v4, -32(r1) +; P9-AIX32-NEXT: lxv v2, 0(r4) +; P9-AIX32-NEXT: vperm v2, v4, v3, v2 +; P9-AIX32-NEXT: stxv v2, 0(r3) ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: test5: ; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lwz r5, L..C1(r2) # %const.0 ; P8-AIX32-NEXT: lwz r4, 0(r4) -; P8-AIX32-NEXT: srawi r5, r4, 31 ; P8-AIX32-NEXT: stw r4, -16(r1) +; P8-AIX32-NEXT: srawi r4, r4, 31 +; P8-AIX32-NEXT: stw r4, -32(r1) +; P8-AIX32-NEXT: lxvw4x v2, 0, r5 ; P8-AIX32-NEXT: addi r4, r1, -16 -; P8-AIX32-NEXT: stw r5, -32(r1) ; P8-AIX32-NEXT: addi r5, r1, -32 -; P8-AIX32-NEXT: lxvw4x vs0, 0, r4 -; P8-AIX32-NEXT: lxvw4x vs1, 0, r5 -; P8-AIX32-NEXT: xxmrghw v2, vs1, vs0 -; P8-AIX32-NEXT: xxswapd vs0, v2 -; P8-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 -; P8-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: lxvw4x v3, 0, r4 +; P8-AIX32-NEXT: lxvw4x v4, 0, r5 +; P8-AIX32-NEXT: vperm v2, v4, v3, v2 +; P8-AIX32-NEXT: stxvw4x v2, 0, r3 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: test5: ; P7-AIX32: # %bb.0: # %entry ; P7-AIX32-NEXT: lwz r4, 0(r4) -; P7-AIX32-NEXT: addi r5, r1, -16 +; P7-AIX32-NEXT: lwz r5, 
L..C1(r2) # %const.0 +; P7-AIX32-NEXT: srawi r6, r4, 31 ; P7-AIX32-NEXT: stw r4, -16(r1) -; P7-AIX32-NEXT: srawi r4, r4, 31 -; P7-AIX32-NEXT: stw r4, -32(r1) -; P7-AIX32-NEXT: addi r4, r1, -32 -; P7-AIX32-NEXT: lxvw4x vs0, 0, r5 -; P7-AIX32-NEXT: lxvw4x vs1, 0, r4 -; P7-AIX32-NEXT: xxmrghw v2, vs1, vs0 -; P7-AIX32-NEXT: xxswapd vs0, v2 -; P7-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 -; P7-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: addi r4, r1, -16 +; P7-AIX32-NEXT: stw r6, -32(r1) +; P7-AIX32-NEXT: lxvw4x v2, 0, r5 +; P7-AIX32-NEXT: addi r5, r1, -32 +; P7-AIX32-NEXT: lxvw4x v3, 0, r4 +; P7-AIX32-NEXT: lxvw4x v4, 0, r5 +; P7-AIX32-NEXT: vperm v2, v4, v3, v2 +; P7-AIX32-NEXT: stxvw4x v2, 0, r3 ; P7-AIX32-NEXT: blr entry: %0 = load i32, i32* %in, align 4 @@ -365,45 +365,45 @@ ; P9-AIX32-NEXT: lwz r4, 0(r4) ; P9-AIX32-NEXT: li r5, 0 ; P9-AIX32-NEXT: stw r5, -32(r1) -; P9-AIX32-NEXT: lxv vs0, -32(r1) +; P9-AIX32-NEXT: lxv v3, -32(r1) ; P9-AIX32-NEXT: stw r4, -16(r1) -; P9-AIX32-NEXT: lxv vs1, -16(r1) -; P9-AIX32-NEXT: xxmrghw v2, vs0, vs1 -; P9-AIX32-NEXT: xxswapd vs0, v2 -; P9-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 -; P9-AIX32-NEXT: stxv vs0, 0(r3) +; P9-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0 +; P9-AIX32-NEXT: lxv v4, -16(r1) +; P9-AIX32-NEXT: lxv v2, 0(r4) +; P9-AIX32-NEXT: vperm v2, v3, v4, v2 +; P9-AIX32-NEXT: stxv v2, 0(r3) ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: test6: ; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lwz r6, L..C2(r2) # %const.0 ; P8-AIX32-NEXT: lwz r4, 0(r4) ; P8-AIX32-NEXT: li r5, 0 ; P8-AIX32-NEXT: stw r5, -32(r1) ; P8-AIX32-NEXT: addi r5, r1, -16 ; P8-AIX32-NEXT: stw r4, -16(r1) ; P8-AIX32-NEXT: addi r4, r1, -32 -; P8-AIX32-NEXT: lxvw4x vs0, 0, r4 -; P8-AIX32-NEXT: lxvw4x vs1, 0, r5 -; P8-AIX32-NEXT: xxmrghw v2, vs0, vs1 -; P8-AIX32-NEXT: xxswapd vs0, v2 -; P8-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 -; P8-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: lxvw4x v2, 0, r6 +; P8-AIX32-NEXT: lxvw4x v3, 0, r4 +; P8-AIX32-NEXT: lxvw4x v4, 0, r5 +; 
P8-AIX32-NEXT: vperm v2, v3, v4, v2 +; P8-AIX32-NEXT: stxvw4x v2, 0, r3 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: test6: ; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lwz r5, L..C2(r2) # %const.0 ; P7-AIX32-NEXT: lwz r4, 0(r4) -; P7-AIX32-NEXT: li r5, 0 -; P7-AIX32-NEXT: stw r5, -32(r1) -; P7-AIX32-NEXT: addi r5, r1, -16 +; P7-AIX32-NEXT: li r6, 0 +; P7-AIX32-NEXT: stw r6, -32(r1) ; P7-AIX32-NEXT: stw r4, -16(r1) ; P7-AIX32-NEXT: addi r4, r1, -32 -; P7-AIX32-NEXT: lxvw4x vs0, 0, r4 -; P7-AIX32-NEXT: lxvw4x vs1, 0, r5 -; P7-AIX32-NEXT: xxmrghw v2, vs0, vs1 -; P7-AIX32-NEXT: xxswapd vs0, v2 -; P7-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 -; P7-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: lxvw4x v2, 0, r5 +; P7-AIX32-NEXT: addi r5, r1, -16 +; P7-AIX32-NEXT: lxvw4x v3, 0, r4 +; P7-AIX32-NEXT: lxvw4x v4, 0, r5 +; P7-AIX32-NEXT: vperm v2, v3, v4, v2 +; P7-AIX32-NEXT: stxvw4x v2, 0, r3 ; P7-AIX32-NEXT: blr entry: %0 = load i32, i32* %in, align 4 @@ -832,32 +832,34 @@ ; ; P8-AIX32-LABEL: unadjusted_lxvdsx: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r4, 4(r3) -; P8-AIX32-NEXT: stw r4, -32(r1) -; P8-AIX32-NEXT: addi r4, r1, -16 +; P8-AIX32-NEXT: lwz r5, 4(r3) +; P8-AIX32-NEXT: lwz r4, L..C3(r2) # %const.0 +; P8-AIX32-NEXT: stw r5, -32(r1) ; P8-AIX32-NEXT: lwz r3, 0(r3) +; P8-AIX32-NEXT: lxvw4x v2, 0, r4 +; P8-AIX32-NEXT: addi r4, r1, -16 ; P8-AIX32-NEXT: stw r3, -16(r1) ; P8-AIX32-NEXT: addi r3, r1, -32 -; P8-AIX32-NEXT: lxvw4x vs0, 0, r3 -; P8-AIX32-NEXT: lxvw4x vs1, 0, r4 -; P8-AIX32-NEXT: xxmrghw v2, vs1, vs0 -; P8-AIX32-NEXT: xxsldwi vs0, vs1, v2, 2 -; P8-AIX32-NEXT: xxmrgld v2, vs0, vs0 +; P8-AIX32-NEXT: lxvw4x v3, 0, r3 +; P8-AIX32-NEXT: lxvw4x v4, 0, r4 +; P8-AIX32-NEXT: vperm v2, v4, v3, v2 +; P8-AIX32-NEXT: xxmrghd v2, v2, v2 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: unadjusted_lxvdsx: ; P7-AIX32: # %bb.0: # %entry ; P7-AIX32-NEXT: lwz r5, 4(r3) -; P7-AIX32-NEXT: addi r4, r1, -32 +; P7-AIX32-NEXT: lwz r4, L..C3(r2) # %const.0 ; P7-AIX32-NEXT: stw r5, -32(r1) 
; P7-AIX32-NEXT: lwz r3, 0(r3) +; P7-AIX32-NEXT: lxvw4x v2, 0, r4 +; P7-AIX32-NEXT: addi r4, r1, -16 ; P7-AIX32-NEXT: stw r3, -16(r1) -; P7-AIX32-NEXT: addi r3, r1, -16 -; P7-AIX32-NEXT: lxvw4x vs0, 0, r4 -; P7-AIX32-NEXT: lxvw4x vs1, 0, r3 -; P7-AIX32-NEXT: xxmrghw v2, vs1, vs0 -; P7-AIX32-NEXT: xxsldwi vs0, vs1, v2, 2 -; P7-AIX32-NEXT: xxmrgld v2, vs0, vs0 +; P7-AIX32-NEXT: addi r3, r1, -32 +; P7-AIX32-NEXT: lxvw4x v3, 0, r3 +; P7-AIX32-NEXT: lxvw4x v4, 0, r4 +; P7-AIX32-NEXT: vperm v2, v4, v3, v2 +; P7-AIX32-NEXT: xxmrghd v2, v2, v2 ; P7-AIX32-NEXT: blr entry: %0 = bitcast i64* %s to <8 x i8>* diff --git a/llvm/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll b/llvm/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll --- a/llvm/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll +++ b/llvm/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll @@ -1,4 +1,7 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -ppc-disable-perfect-shuffle=false < %s | FileCheck %s + +; TODO: Fix this case when disabling perfect shuffle + target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll --- a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll @@ -1,19 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple powerpc64 -mcpu=pwr10 < %s | FileCheck %s --check-prefix=BE ; RUN: llc -mtriple powerpc64le -mcpu=pwr10 < %s | FileCheck %s --check-prefix=LE +; RUN: llc -mtriple powerpc64le -mcpu=pwr10 -ppc-disable-perfect-shuffle=false < %s | FileCheck %s --check-prefix=LE +; RUN: llc -mtriple powerpc64 -mcpu=pwr10 -ppc-disable-perfect-shuffle=false < %s | FileCheck %s --check-prefix=BE-ENABLE + +; TODO: Fix the worse codegen when disabling perfect shuffle define <4 x float> @shuffle1(<16 x i8> %v1, <16 x i8> %v2) { ; BE-LABEL: 
shuffle1: ; BE: # %bb.0: -; BE-NEXT: xxmrglw 0, 34, 35 -; BE-NEXT: xxmrghw 1, 34, 35 -; BE-NEXT: xxmrghw 34, 1, 0 +; BE-NEXT: addis 3, 2, .LCPI0_0@toc@ha +; BE-NEXT: addi 3, 3, .LCPI0_0@toc@l +; BE-NEXT: lxv 36, 0(3) +; BE-NEXT: vperm 2, 2, 3, 4 ; BE-NEXT: blr ; ; LE-LABEL: shuffle1: ; LE: # %bb.0: ; LE-NEXT: vpkudum 2, 3, 2 ; LE-NEXT: blr +; +; BE-ENABLE-LABEL: shuffle1: +; BE-ENABLE: # %bb.0: +; BE-ENABLE-NEXT: xxmrglw 0, 34, 35 +; BE-ENABLE-NEXT: xxmrghw 1, 34, 35 +; BE-ENABLE-NEXT: xxmrghw 34, 1, 0 +; BE-ENABLE-NEXT: blr %shuf = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> %cast = bitcast <16 x i8> %shuf to <4 x float> ret <4 x float> %cast @@ -30,6 +42,11 @@ ; LE-NEXT: plxv 36, .LCPI1_0@PCREL(0), 1 ; LE-NEXT: vperm 2, 3, 2, 4 ; LE-NEXT: blr +; +; BE-ENABLE-LABEL: shuffle2: +; BE-ENABLE: # %bb.0: +; BE-ENABLE-NEXT: vpkudum 2, 2, 3 +; BE-ENABLE-NEXT: blr %shuf = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> %cast = bitcast <16 x i8> %shuf to <4 x float> ret <4 x float> %cast @@ -38,12 +55,11 @@ define <4 x float> @shuffle3(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x i8> %v4) { ; BE-LABEL: shuffle3: ; BE: # %bb.0: -; BE-NEXT: xxmrglw 0, 34, 35 -; BE-NEXT: xxmrghw 1, 34, 35 -; BE-NEXT: xxmrghw 34, 1, 0 -; BE-NEXT: xxmrglw 0, 36, 37 -; BE-NEXT: xxmrghw 1, 36, 37 -; BE-NEXT: xxmrghw 35, 1, 0 +; BE-NEXT: addis 3, 2, .LCPI2_0@toc@ha +; BE-NEXT: addi 3, 3, .LCPI2_0@toc@l +; BE-NEXT: lxv 32, 0(3) +; BE-NEXT: vperm 2, 2, 3, 0 +; BE-NEXT: vperm 3, 4, 5, 0 ; BE-NEXT: xvaddsp 34, 34, 35 ; BE-NEXT: blr ; @@ -53,6 +69,17 @@ ; LE-NEXT: vpkudum 3, 5, 4 ; LE-NEXT: xvaddsp 34, 34, 35 ; LE-NEXT: blr +; +; BE-ENABLE-LABEL: shuffle3: +; BE-ENABLE: # %bb.0: +; BE-ENABLE-NEXT: xxmrglw 0, 34, 35 +; BE-ENABLE-NEXT: xxmrghw 1, 34, 35 +; BE-ENABLE-NEXT: xxmrghw 34, 1, 0 +; BE-ENABLE-NEXT: xxmrglw 0, 36, 37 +; BE-ENABLE-NEXT: xxmrghw 1, 36, 37 +; BE-ENABLE-NEXT: xxmrghw 35, 1, 0 +; BE-ENABLE-NEXT: xvaddsp 34, 34, 35 +; BE-ENABLE-NEXT: blr %shuf1 = shufflevector 
<16 x i8> %v1, <16 x i8> %v2, <16 x i32> %shuf2 = shufflevector <16 x i8> %v3, <16 x i8> %v4, <16 x i32> %cast1 = bitcast <16 x i8> %shuf1 to <4 x float> @@ -76,6 +103,13 @@ ; LE-NEXT: vperm 3, 5, 4, 0 ; LE-NEXT: xvaddsp 34, 34, 35 ; LE-NEXT: blr +; +; BE-ENABLE-LABEL: shuffle4: +; BE-ENABLE: # %bb.0: +; BE-ENABLE-NEXT: vpkudum 2, 2, 3 +; BE-ENABLE-NEXT: vpkudum 3, 4, 5 +; BE-ENABLE-NEXT: xvaddsp 34, 34, 35 +; BE-ENABLE-NEXT: blr %shuf1 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> %shuf2 = shufflevector <16 x i8> %v3, <16 x i8> %v4, <16 x i32> %cast1 = bitcast <16 x i8> %shuf1 to <4 x float> @@ -87,22 +121,21 @@ define <4 x float> @shuffle5(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x i8> %v4) { ; BE-LABEL: shuffle5: ; BE: # %bb.0: # %entry -; BE-NEXT: xxmrglw 0, 34, 35 -; BE-NEXT: xxmrghw 1, 34, 35 +; BE-NEXT: addis 3, 2, .LCPI4_0@toc@ha +; BE-NEXT: addi 3, 3, .LCPI4_0@toc@l +; BE-NEXT: lxv 32, 0(3) ; BE-NEXT: li 3, 8 ; BE-NEXT: vextublx 3, 3, 2 -; BE-NEXT: xxmrghw 0, 1, 0 ; BE-NEXT: andi. 3, 3, 255 -; BE-NEXT: xxlor 1, 0, 0 +; BE-NEXT: vperm 3, 2, 3, 0 +; BE-NEXT: vmr 2, 3 ; BE-NEXT: beq 0, .LBB4_2 ; BE-NEXT: # %bb.1: # %exit -; BE-NEXT: xvaddsp 34, 0, 1 +; BE-NEXT: xvaddsp 34, 35, 34 ; BE-NEXT: blr ; BE-NEXT: .LBB4_2: # %second -; BE-NEXT: xxmrglw 1, 36, 37 -; BE-NEXT: xxmrghw 2, 36, 37 -; BE-NEXT: xxmrghw 1, 2, 1 -; BE-NEXT: xvaddsp 34, 0, 1 +; BE-NEXT: vperm 2, 4, 5, 0 +; BE-NEXT: xvaddsp 34, 35, 34 ; BE-NEXT: blr ; ; LE-LABEL: shuffle5: @@ -120,6 +153,26 @@ ; LE-NEXT: vpkudum 2, 5, 4 ; LE-NEXT: xvaddsp 34, 35, 34 ; LE-NEXT: blr +; +; BE-ENABLE-LABEL: shuffle5: +; BE-ENABLE: # %bb.0: # %entry +; BE-ENABLE-NEXT: xxmrglw 0, 34, 35 +; BE-ENABLE-NEXT: xxmrghw 1, 34, 35 +; BE-ENABLE-NEXT: li 3, 8 +; BE-ENABLE-NEXT: vextublx 3, 3, 2 +; BE-ENABLE-NEXT: xxmrghw 0, 1, 0 +; BE-ENABLE-NEXT: andi. 
3, 3, 255 +; BE-ENABLE-NEXT: xxlor 1, 0, 0 +; BE-ENABLE-NEXT: beq 0, .LBB4_2 +; BE-ENABLE-NEXT: # %bb.1: # %exit +; BE-ENABLE-NEXT: xvaddsp 34, 0, 1 +; BE-ENABLE-NEXT: blr +; BE-ENABLE-NEXT: .LBB4_2: # %second +; BE-ENABLE-NEXT: xxmrglw 1, 36, 37 +; BE-ENABLE-NEXT: xxmrghw 2, 36, 37 +; BE-ENABLE-NEXT: xxmrghw 1, 2, 1 +; BE-ENABLE-NEXT: xvaddsp 34, 0, 1 +; BE-ENABLE-NEXT: blr entry: %shuf1 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> %fetch = extractelement <16 x i8> %shuf1, i32 4 diff --git a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll --- a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll +++ b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll @@ -10,26 +10,27 @@ ; 32BIT: # %bb.0: # %entry ; 32BIT-NEXT: stwu 1, -64(1) ; 32BIT-NEXT: .cfi_def_cfa_offset 64 -; 32BIT-NEXT: lxvw4x 34, 0, 3 -; 32BIT-NEXT: li 3, 0 -; 32BIT-NEXT: addi 4, 1, 16 -; 32BIT-NEXT: addi 5, 1, 32 +; 32BIT-NEXT: li 3, .LCPI0_0@l +; 32BIT-NEXT: lis 4, .LCPI0_0@ha +; 32BIT-NEXT: addi 5, 1, 16 ; 32BIT-NEXT: addi 6, 1, 48 ; 32BIT-NEXT: li 7, 0 +; 32BIT-NEXT: lxvw4x 34, 0, 3 +; 32BIT-NEXT: lxvw4x 35, 4, 3 +; 32BIT-NEXT: li 3, 0 +; 32BIT-NEXT: addi 4, 1, 32 ; 32BIT-NEXT: .p2align 4 ; 32BIT-NEXT: .LBB0_1: # %while.body ; 32BIT-NEXT: # -; 32BIT-NEXT: stw 7, 16(1) ; 32BIT-NEXT: stw 3, 32(1) -; 32BIT-NEXT: lxvw4x 0, 0, 4 -; 32BIT-NEXT: lxvw4x 1, 0, 5 -; 32BIT-NEXT: xxsldwi 0, 1, 0, 1 -; 32BIT-NEXT: xxspltw 1, 1, 0 -; 32BIT-NEXT: xxsldwi 35, 0, 1, 3 -; 32BIT-NEXT: vadduwm 3, 2, 3 -; 32BIT-NEXT: xxspltw 36, 35, 1 -; 32BIT-NEXT: vadduwm 3, 3, 4 -; 32BIT-NEXT: stxvw4x 35, 0, 6 +; 32BIT-NEXT: stw 7, 16(1) +; 32BIT-NEXT: lxvw4x 36, 0, 4 +; 32BIT-NEXT: lxvw4x 37, 0, 5 +; 32BIT-NEXT: vperm 4, 5, 4, 3 +; 32BIT-NEXT: vadduwm 4, 2, 4 +; 32BIT-NEXT: xxspltw 37, 36, 1 +; 32BIT-NEXT: vadduwm 4, 4, 5 +; 32BIT-NEXT: stxvw4x 36, 0, 6 ; 32BIT-NEXT: lwz 7, 48(1) ; 32BIT-NEXT: b .LBB0_1 ; diff --git a/llvm/test/CodeGen/PowerPC/pr27078.ll 
b/llvm/test/CodeGen/PowerPC/pr27078.ll --- a/llvm/test/CodeGen/PowerPC/pr27078.ll +++ b/llvm/test/CodeGen/PowerPC/pr27078.ll @@ -4,23 +4,26 @@ define <4 x float> @bar(float* %p, float* %q) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: -; CHECK-NEXT: lxvw4x 0, 0, 3 -; CHECK-NEXT: lxvw4x 1, 0, 4 ; CHECK-NEXT: li 5, 16 -; CHECK-NEXT: lxvw4x 2, 3, 5 -; CHECK-NEXT: lxvw4x 3, 4, 5 +; CHECK-NEXT: lxvw4x 2, 0, 3 +; CHECK-NEXT: lxvw4x 3, 0, 4 +; CHECK-NEXT: lxvw4x 0, 3, 5 +; CHECK-NEXT: lxvw4x 1, 4, 5 ; CHECK-NEXT: li 5, 32 -; CHECK-NEXT: lxvw4x 4, 4, 5 +; CHECK-NEXT: xvsubsp 35, 3, 2 +; CHECK-NEXT: xvsubsp 34, 1, 0 +; CHECK-NEXT: lxvw4x 0, 3, 5 +; CHECK-NEXT: lxvw4x 1, 4, 5 +; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha +; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l +; CHECK-NEXT: lxvw4x 36, 0, 3 +; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha ; CHECK-NEXT: xvsubsp 0, 1, 0 -; CHECK-NEXT: lxvw4x 1, 3, 5 -; CHECK-NEXT: xvsubsp 2, 3, 2 -; CHECK-NEXT: xvsubsp 1, 4, 1 -; CHECK-NEXT: xxsldwi 0, 0, 0, 1 -; CHECK-NEXT: xxmrglw 34, 0, 2 -; CHECK-NEXT: xxsldwi 0, 0, 34, 3 -; CHECK-NEXT: xxmrghw 34, 1, 1 -; CHECK-NEXT: xxsldwi 0, 34, 0, 3 -; CHECK-NEXT: xxsldwi 34, 0, 0, 1 +; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l +; CHECK-NEXT: vperm 2, 3, 2, 4 +; CHECK-NEXT: lxvw4x 36, 0, 3 +; CHECK-NEXT: xxmrghw 35, 0, 0 +; CHECK-NEXT: vperm 2, 2, 3, 4 ; CHECK-NEXT: blr %1 = bitcast float* %p to <12 x float>* %2 = bitcast float* %q to <12 x float>* diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll @@ -35,10 +35,11 @@ ; ; P8BE-LABEL: s2v_test1: ; P8BE: # %bb.0: # %entry -; P8BE-NEXT: lfiwzx f0, 0, r3 -; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1 -; P8BE-NEXT: xxmrghw v2, v2, vs0 -; P8BE-NEXT: xxsldwi v2, v2, vs1, 3 +; P8BE-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; P8BE-NEXT: lxsiwzx v4, 0, r3 +; P8BE-NEXT: addi r4, r4, .LCPI0_0@toc@l +; P8BE-NEXT: lxvw4x 
v3, 0, r4 +; P8BE-NEXT: vperm v2, v4, v2, v3 ; P8BE-NEXT: blr entry: %0 = load i32, i32* %int32, align 4 @@ -74,11 +75,12 @@ ; ; P8BE-LABEL: s2v_test2: ; P8BE: # %bb.0: # %entry +; P8BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha ; P8BE-NEXT: addi r3, r3, 4 -; P8BE-NEXT: lfiwzx f0, 0, r3 -; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1 -; P8BE-NEXT: xxmrghw v2, v2, vs0 -; P8BE-NEXT: xxsldwi v2, v2, vs1, 3 +; P8BE-NEXT: addi r4, r4, .LCPI1_0@toc@l +; P8BE-NEXT: lxsiwzx v4, 0, r3 +; P8BE-NEXT: lxvw4x v3, 0, r4 +; P8BE-NEXT: vperm v2, v4, v2, v3 ; P8BE-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, i32* %int32, i64 1 @@ -117,11 +119,12 @@ ; ; P8BE-LABEL: s2v_test3: ; P8BE: # %bb.0: # %entry -; P8BE-NEXT: sldi r4, r7, 2 -; P8BE-NEXT: lfiwzx f0, r3, r4 -; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1 -; P8BE-NEXT: xxmrghw v2, v2, vs0 -; P8BE-NEXT: xxsldwi v2, v2, vs1, 3 +; P8BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; P8BE-NEXT: sldi r5, r7, 2 +; P8BE-NEXT: addi r4, r4, .LCPI2_0@toc@l +; P8BE-NEXT: lxsiwzx v3, r3, r5 +; P8BE-NEXT: lxvw4x v4, 0, r4 +; P8BE-NEXT: vperm v2, v3, v2, v4 ; P8BE-NEXT: blr entry: %idxprom = sext i32 %Idx to i64 @@ -159,11 +162,12 @@ ; ; P8BE-LABEL: s2v_test4: ; P8BE: # %bb.0: # %entry +; P8BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha ; P8BE-NEXT: addi r3, r3, 4 -; P8BE-NEXT: lfiwzx f0, 0, r3 -; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1 -; P8BE-NEXT: xxmrghw v2, v2, vs0 -; P8BE-NEXT: xxsldwi v2, v2, vs1, 3 +; P8BE-NEXT: addi r4, r4, .LCPI3_0@toc@l +; P8BE-NEXT: lxsiwzx v4, 0, r3 +; P8BE-NEXT: lxvw4x v3, 0, r4 +; P8BE-NEXT: vperm v2, v4, v2, v3 ; P8BE-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, i32* %int32, i64 1 @@ -199,10 +203,11 @@ ; ; P8BE-LABEL: s2v_test5: ; P8BE: # %bb.0: # %entry -; P8BE-NEXT: lfiwzx f0, 0, r5 -; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1 -; P8BE-NEXT: xxmrghw v2, v2, vs0 -; P8BE-NEXT: xxsldwi v2, v2, vs1, 3 +; P8BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8BE-NEXT: lxsiwzx v4, 0, r5 +; P8BE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, 
r3 +; P8BE-NEXT: vperm v2, v4, v2, v3 ; P8BE-NEXT: blr entry: %0 = load i32, i32* %ptr1, align 4 @@ -237,10 +242,11 @@ ; ; P8BE-LABEL: s2v_test_f1: ; P8BE: # %bb.0: # %entry -; P8BE-NEXT: lfiwzx f0, 0, r3 -; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1 -; P8BE-NEXT: xxmrghw v2, v2, vs0 -; P8BE-NEXT: xxsldwi v2, v2, vs1, 3 +; P8BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha +; P8BE-NEXT: lxsiwzx v4, 0, r3 +; P8BE-NEXT: addi r4, r4, .LCPI5_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r4 +; P8BE-NEXT: vperm v2, v4, v2, v3 ; P8BE-NEXT: blr entry: %0 = load float, float* %f64, align 4 diff --git a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll --- a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll +++ b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll @@ -55,21 +55,24 @@ ; CHECK-BE-P7: # %bb.0: # %entry ; CHECK-BE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 +; CHECK-BE-P7-NEXT: addis r4, r2, .LCPI0_0@toc@ha ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 ; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-BE-P7-NEXT: stw r3, -32(r1) -; CHECK-BE-P7-NEXT: addi r3, r1, -32 -; CHECK-BE-P7-NEXT: lxvw4x vs1, 0, r3 -; CHECK-BE-P7-NEXT: xxsldwi v2, vs0, vs1, 1 +; CHECK-BE-P7-NEXT: addi r3, r4, .LCPI0_0@toc@l +; CHECK-BE-P7-NEXT: addi r4, r1, -32 +; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r4 +; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-BE-P8-NEXT: xxmrghw v3, v2, vs0 -; CHECK-BE-P8-NEXT: xxsldwi vs0, v3, v2, 3 -; CHECK-BE-P8-NEXT: xxsldwi v2, vs0, vs0, 1 +; CHECK-BE-P8-NEXT: xscvdpsxws v3, f1 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test: @@ -118,21 +121,24 @@ ; CHECK-BE-P7: 
# %bb.0: # %entry ; CHECK-BE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 +; CHECK-BE-P7-NEXT: addis r4, r2, .LCPI1_0@toc@ha ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 ; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-BE-P7-NEXT: stw r3, -32(r1) -; CHECK-BE-P7-NEXT: addi r3, r1, -32 -; CHECK-BE-P7-NEXT: lxvw4x vs1, 0, r3 -; CHECK-BE-P7-NEXT: xxsldwi v2, vs0, vs1, 1 +; CHECK-BE-P7-NEXT: addi r3, r4, .LCPI1_0@toc@l +; CHECK-BE-P7-NEXT: addi r4, r1, -32 +; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r4 +; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test2: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-BE-P8-NEXT: xxmrghw v3, v2, vs0 -; CHECK-BE-P8-NEXT: xxsldwi vs0, v3, v2, 3 -; CHECK-BE-P8-NEXT: xxsldwi v2, vs0, vs0, 1 +; CHECK-BE-P8-NEXT: xscvdpsxws v3, f1 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test2: @@ -181,21 +187,24 @@ ; CHECK-BE-P7: # %bb.0: # %entry ; CHECK-BE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 +; CHECK-BE-P7-NEXT: addis r4, r2, .LCPI2_0@toc@ha ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 ; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-BE-P7-NEXT: stw r3, -32(r1) -; CHECK-BE-P7-NEXT: addi r3, r1, -32 -; CHECK-BE-P7-NEXT: lxvw4x vs1, 0, r3 -; CHECK-BE-P7-NEXT: xxsldwi v2, vs0, vs1, 1 +; CHECK-BE-P7-NEXT: addi r3, r4, .LCPI2_0@toc@l +; CHECK-BE-P7-NEXT: addi r4, r1, -32 +; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r4 +; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test3: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: xscvdpuxws f0, f1 -; CHECK-BE-P8-NEXT: xxmrghw v3, v2, vs0 -; CHECK-BE-P8-NEXT: 
xxsldwi vs0, v3, v2, 3 -; CHECK-BE-P8-NEXT: xxsldwi v2, vs0, vs0, 1 +; CHECK-BE-P8-NEXT: xscvdpuxws v3, f1 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test3: @@ -244,21 +253,24 @@ ; CHECK-BE-P7: # %bb.0: # %entry ; CHECK-BE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 +; CHECK-BE-P7-NEXT: addis r4, r2, .LCPI3_0@toc@ha ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 ; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-BE-P7-NEXT: stw r3, -32(r1) -; CHECK-BE-P7-NEXT: addi r3, r1, -32 -; CHECK-BE-P7-NEXT: lxvw4x vs1, 0, r3 -; CHECK-BE-P7-NEXT: xxsldwi v2, vs0, vs1, 1 +; CHECK-BE-P7-NEXT: addi r3, r4, .LCPI3_0@toc@l +; CHECK-BE-P7-NEXT: addi r4, r1, -32 +; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r4 +; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test4: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: xscvdpuxws f0, f1 -; CHECK-BE-P8-NEXT: xxmrghw v3, v2, vs0 -; CHECK-BE-P8-NEXT: xxsldwi vs0, v3, v2, 3 -; CHECK-BE-P8-NEXT: xxsldwi v2, vs0, vs0, 1 +; CHECK-BE-P8-NEXT: xscvdpuxws v3, f1 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test4: diff --git a/llvm/test/CodeGen/PowerPC/vec_extract_p9.ll b/llvm/test/CodeGen/PowerPC/vec_extract_p9.ll --- a/llvm/test/CodeGen/PowerPC/vec_extract_p9.ll +++ b/llvm/test/CodeGen/PowerPC/vec_extract_p9.ll @@ -190,10 +190,12 @@ ; CHECK-BE-LABEL: test10: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha -; CHECK-BE-NEXT: xxmrghw 0, 35, 35 -; CHECK-BE-NEXT: lfs 1, .LCPI9_0@toc@l(3) -; CHECK-BE-NEXT: xxmrglw 0, 0, 34 -; CHECK-BE-NEXT: xsadddp 1, 
0, 1 +; CHECK-BE-NEXT: addi 3, 3, .LCPI9_0@toc@l +; CHECK-BE-NEXT: lxv 36, 0(3) +; CHECK-BE-NEXT: addis 3, 2, .LCPI9_1@toc@ha +; CHECK-BE-NEXT: lfs 0, .LCPI9_1@toc@l(3) +; CHECK-BE-NEXT: vperm 2, 3, 2, 4 +; CHECK-BE-NEXT: xsadddp 1, 34, 0 ; CHECK-BE-NEXT: blr entry: %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> diff --git a/llvm/test/CodeGen/PowerPC/vec_perf_shuffle.ll b/llvm/test/CodeGen/PowerPC/vec_perf_shuffle.ll --- a/llvm/test/CodeGen/PowerPC/vec_perf_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/vec_perf_shuffle.ll @@ -1,4 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | not grep vperm +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | not grep vperm + +; TODO: Fix this case when disabling perfect shuffle define <4 x float> @test_uu72(<4 x float>* %P1, <4 x float>* %P2) { %V1 = load <4 x float>, <4 x float>* %P1 ; <<4 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector.ll b/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector.ll --- a/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector.ll +++ b/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector.ll @@ -2,7 +2,7 @@ ; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu -mattr=+power8-vector < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi -mattr=+power8-vector < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck -check-prefix=CHECK-PWR7 %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi < %s | FileCheck -check-prefix=CHECK-PWR7 %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi < %s | FileCheck -check-prefix=CHECK-PWR7-AIX %s define void @VPKUDUM_unary(<2 x i64>* %A) { ; CHECK-LABEL: VPKUDUM_unary: @@ -14,12 +14,22 @@ ; ; CHECK-PWR7-LABEL: VPKUDUM_unary: ; CHECK-PWR7: # %bb.0: # 
%entry -; CHECK-PWR7-NEXT: lxvw4x 0, 0, 3 -; CHECK-PWR7-NEXT: xxmrglw 1, 0, 0 -; CHECK-PWR7-NEXT: xxmrghw 0, 0, 0 -; CHECK-PWR7-NEXT: xxmrglw 0, 0, 1 -; CHECK-PWR7-NEXT: stxvw4x 0, 0, 3 +; CHECK-PWR7-NEXT: addis 4, 2, .LCPI0_0@toc@ha +; CHECK-PWR7-NEXT: lxvw4x 34, 0, 3 +; CHECK-PWR7-NEXT: addi 4, 4, .LCPI0_0@toc@l +; CHECK-PWR7-NEXT: lxvw4x 35, 0, 4 +; CHECK-PWR7-NEXT: vperm 2, 2, 2, 3 +; CHECK-PWR7-NEXT: stxvw4x 34, 0, 3 ; CHECK-PWR7-NEXT: blr +; +; CHECK-PWR7-AIX-LABEL: VPKUDUM_unary: +; CHECK-PWR7-AIX: # %bb.0: # %entry +; CHECK-PWR7-AIX-NEXT: ld 4, L..C0(2) # %const.0 +; CHECK-PWR7-AIX-NEXT: lxvw4x 34, 0, 3 +; CHECK-PWR7-AIX-NEXT: lxvw4x 35, 0, 4 +; CHECK-PWR7-AIX-NEXT: vperm 2, 2, 2, 3 +; CHECK-PWR7-AIX-NEXT: stxvw4x 34, 0, 3 +; CHECK-PWR7-AIX-NEXT: blr entry: %tmp = load <2 x i64>, <2 x i64>* %A %tmp2 = bitcast <2 x i64> %tmp to <4 x i32> @@ -45,13 +55,24 @@ ; ; CHECK-PWR7-LABEL: VPKUDUM: ; CHECK-PWR7: # %bb.0: # %entry -; CHECK-PWR7-NEXT: lxvw4x 0, 0, 3 -; CHECK-PWR7-NEXT: lxvw4x 1, 0, 4 -; CHECK-PWR7-NEXT: xxmrglw 2, 0, 1 -; CHECK-PWR7-NEXT: xxmrghw 0, 0, 1 -; CHECK-PWR7-NEXT: xxmrglw 0, 0, 2 -; CHECK-PWR7-NEXT: stxvw4x 0, 0, 3 +; CHECK-PWR7-NEXT: addis 5, 2, .LCPI1_0@toc@ha +; CHECK-PWR7-NEXT: lxvw4x 34, 0, 4 +; CHECK-PWR7-NEXT: lxvw4x 35, 0, 3 +; CHECK-PWR7-NEXT: addi 4, 5, .LCPI1_0@toc@l +; CHECK-PWR7-NEXT: lxvw4x 36, 0, 4 +; CHECK-PWR7-NEXT: vperm 2, 3, 2, 4 +; CHECK-PWR7-NEXT: stxvw4x 34, 0, 3 ; CHECK-PWR7-NEXT: blr +; +; CHECK-PWR7-AIX-LABEL: VPKUDUM: +; CHECK-PWR7-AIX: # %bb.0: # %entry +; CHECK-PWR7-AIX-NEXT: ld 5, L..C1(2) # %const.0 +; CHECK-PWR7-AIX-NEXT: lxvw4x 34, 0, 4 +; CHECK-PWR7-AIX-NEXT: lxvw4x 35, 0, 3 +; CHECK-PWR7-AIX-NEXT: lxvw4x 36, 0, 5 +; CHECK-PWR7-AIX-NEXT: vperm 2, 3, 2, 4 +; CHECK-PWR7-AIX-NEXT: stxvw4x 34, 0, 3 +; CHECK-PWR7-AIX-NEXT: blr entry: %tmp = load <2 x i64>, <2 x i64>* %A %tmp2 = bitcast <2 x i64> %tmp to <4 x i32>