diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -428,7 +428,7 @@ } if (Triple.isOSAIX() || Triple.isOSLinux()) - DataLayout += "-v256:256:256-v512:512:512"; + DataLayout += "-S128-v256:256:256-v512:512:512"; resetDataLayout(DataLayout); // PPC64 supports atomics up to 8 bytes. diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -140,27 +140,27 @@ // RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=PPC64-LINUX -// PPC64-LINUX: target datalayout = "E-m:e-i64:64-n32:64-v256:256:256-v512:512:512" +// PPC64-LINUX: target datalayout = "E-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" // RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm -target-cpu future %s | \ // RUN: FileCheck %s -check-prefix=PPC64-FUTURE -// PPC64-FUTURE: target datalayout = "E-m:e-i64:64-n32:64-v256:256:256-v512:512:512" +// PPC64-FUTURE: target datalayout = "E-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" // RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm -target-cpu pwr10 %s | \ // RUN: FileCheck %s -check-prefix=PPC64-P10 -// PPC64-P10: target datalayout = "E-m:e-i64:64-n32:64-v256:256:256-v512:512:512" +// PPC64-P10: target datalayout = "E-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" // RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=PPC64LE-LINUX -// PPC64LE-LINUX: target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" +// PPC64LE-LINUX: target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" // RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm -target-cpu future %s | \ // RUN: FileCheck %s -check-prefix=PPC64LE-FUTURE -// PPC64LE-FUTURE: target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" +// PPC64LE-FUTURE: 
target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" // RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm -target-cpu pwr10 %s | \ // RUN: FileCheck %s -check-prefix=PPC64LE-P10 -// PPC64LE-P10: target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" +// PPC64LE-P10: target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" // RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=NVPTX diff --git a/lld/test/ELF/common-archive-lookup.s b/lld/test/ELF/common-archive-lookup.s --- a/lld/test/ELF/common-archive-lookup.s +++ b/lld/test/ELF/common-archive-lookup.s @@ -162,13 +162,13 @@ #--- blockdata.ll -target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" +target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" target triple = "powerpc64le-unknown-linux-gnu" @block = dso_local local_unnamed_addr global [5 x i32] [i32 5, i32 0, i32 0, i32 0, i32 0], align 4 #--- commonblock.ll -target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" +target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" target triple = "powerpc64le-unknown-linux-gnu" @block = common dso_local local_unnamed_addr global [5 x i32] zeroinitializer, align 4 diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -157,9 +157,8 @@ // Specify the vector alignment explicitly. For v256i1 and v512i1, the // calculated alignment would be 256*alignment(i1) and 512*alignment(i1), // which is 256 and 512 bytes - way over aligned. 
- if ((T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppc64) && - (T.isOSAIX() || T.isOSLinux())) - Ret += "-v256:256:256-v512:512:512"; + if (is64Bit && (T.isOSAIX() || T.isOSLinux())) + Ret += "-S128-v256:256:256-v512:512:512"; return Ret; } diff --git a/llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll b/llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll @@ -0,0 +1,214 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE +; RUN: opt --passes=sroa,loop-vectorize,loop-unroll,instcombine -S \ +; RUN: -vectorizer-maximize-bandwidth --mtriple=powerpc64le-- -mcpu=pwr10 < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-OPT + +target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" + +define dso_local signext i32 @test_32byte_vector() nounwind { +; CHECK-LE-LABEL: test_32byte_vector: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mflr r0 +; CHECK-LE-NEXT: std r30, -16(r1) +; CHECK-LE-NEXT: mr r30, r1 +; CHECK-LE-NEXT: std r0, 16(r1) +; CHECK-LE-NEXT: clrldi r0, r1, 59 +; CHECK-LE-NEXT: subfic r0, r0, -96 +; CHECK-LE-NEXT: stdux r1, r1, r0 +; CHECK-LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-LE-NEXT: addis r4, r2, .LCPI0_1@toc@ha +; CHECK-LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-LE-NEXT: addi r4, r4, .LCPI0_1@toc@l +; CHECK-LE-NEXT: lvx v2, 0, r3 +; CHECK-LE-NEXT: lvx v3, 0, r4 +; CHECK-LE-NEXT: addi r4, r1, 48 +; CHECK-LE-NEXT: addi r3, r1, 32 +; CHECK-LE-NEXT: stvx v2, 0, r4 +; CHECK-LE-NEXT: stvx v3, 0, r3 +; CHECK-LE-NEXT: bl test +; CHECK-LE-NEXT: nop +; 
CHECK-LE-NEXT: lwa r3, 32(r1) +; CHECK-LE-NEXT: mr r1, r30 +; CHECK-LE-NEXT: ld r0, 16(r1) +; CHECK-LE-NEXT: ld r30, -16(r1) +; CHECK-LE-NEXT: mtlr r0 +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: test_32byte_vector: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mflr r0 +; CHECK-BE-NEXT: std r30, -16(r1) +; CHECK-BE-NEXT: std r0, 16(r1) +; CHECK-BE-NEXT: clrldi r0, r1, 59 +; CHECK-BE-NEXT: mr r30, r1 +; CHECK-BE-NEXT: subfic r0, r0, -192 +; CHECK-BE-NEXT: stdux r1, r1, r0 +; CHECK-BE-NEXT: lis r3, -8192 +; CHECK-BE-NEXT: li r4, 5 +; CHECK-BE-NEXT: lis r5, -16384 +; CHECK-BE-NEXT: lis r6, -32768 +; CHECK-BE-NEXT: ori r3, r3, 1 +; CHECK-BE-NEXT: rldic r4, r4, 32, 29 +; CHECK-BE-NEXT: ori r5, r5, 1 +; CHECK-BE-NEXT: ori r6, r6, 1 +; CHECK-BE-NEXT: rldic r3, r3, 3, 29 +; CHECK-BE-NEXT: ori r4, r4, 6 +; CHECK-BE-NEXT: rldic r5, r5, 2, 30 +; CHECK-BE-NEXT: rldic r6, r6, 1, 31 +; CHECK-BE-NEXT: std r3, 152(r1) +; CHECK-BE-NEXT: addi r3, r1, 128 +; CHECK-BE-NEXT: std r4, 144(r1) +; CHECK-BE-NEXT: std r5, 136(r1) +; CHECK-BE-NEXT: std r6, 128(r1) +; CHECK-BE-NEXT: bl test +; CHECK-BE-NEXT: nop +; CHECK-BE-NEXT: lwa r3, 128(r1) +; CHECK-BE-NEXT: mr r1, r30 +; CHECK-BE-NEXT: ld r0, 16(r1) +; CHECK-BE-NEXT: ld r30, -16(r1) +; CHECK-BE-NEXT: mtlr r0 +; CHECK-BE-NEXT: blr +entry: + %a = alloca <8 x i32>, align 32 + %0 = bitcast <8 x i32>* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* %0) + store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %a, align 32 + call void @test(<8 x i32>* %a) + %1 = load <8 x i32>, <8 x i32>* %a, align 32 + %vecext = extractelement <8 x i32> %1, i32 0 + %2 = bitcast <8 x i32>* %a to i8* + call void @llvm.lifetime.end.p0i8(i64 32, i8* %2) + ret i32 %vecext +} + +define dso_local signext i32 @test_32byte_aligned_vector() nounwind { +; CHECK-LE-LABEL: test_32byte_aligned_vector: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mflr r0 +; CHECK-LE-NEXT: std r30, -16(r1) +; CHECK-LE-NEXT: mr r30, r1 +; CHECK-LE-NEXT: std r0, 16(r1) +; CHECK-LE-NEXT: clrldi r0, r1,
59 +; CHECK-LE-NEXT: subfic r0, r0, -64 +; CHECK-LE-NEXT: stdux r1, r1, r0 +; CHECK-LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-LE-NEXT: lvx v2, 0, r3 +; CHECK-LE-NEXT: addi r3, r1, 32 +; CHECK-LE-NEXT: stvx v2, 0, r3 +; CHECK-LE-NEXT: bl test1 +; CHECK-LE-NEXT: nop +; CHECK-LE-NEXT: lwa r3, 32(r1) +; CHECK-LE-NEXT: mr r1, r30 +; CHECK-LE-NEXT: ld r0, 16(r1) +; CHECK-LE-NEXT: ld r30, -16(r1) +; CHECK-LE-NEXT: mtlr r0 +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: test_32byte_aligned_vector: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mflr r0 +; CHECK-BE-NEXT: std r30, -16(r1) +; CHECK-BE-NEXT: std r0, 16(r1) +; CHECK-BE-NEXT: clrldi r0, r1, 59 +; CHECK-BE-NEXT: mr r30, r1 +; CHECK-BE-NEXT: subfic r0, r0, -160 +; CHECK-BE-NEXT: stdux r1, r1, r0 +; CHECK-BE-NEXT: lis r3, -16384 +; CHECK-BE-NEXT: lis r4, -32768 +; CHECK-BE-NEXT: ori r3, r3, 1 +; CHECK-BE-NEXT: ori r4, r4, 1 +; CHECK-BE-NEXT: rldic r3, r3, 2, 30 +; CHECK-BE-NEXT: rldic r4, r4, 1, 31 +; CHECK-BE-NEXT: std r3, 136(r1) +; CHECK-BE-NEXT: addi r3, r1, 128 +; CHECK-BE-NEXT: std r4, 128(r1) +; CHECK-BE-NEXT: bl test1 +; CHECK-BE-NEXT: nop +; CHECK-BE-NEXT: lwa r3, 128(r1) +; CHECK-BE-NEXT: mr r1, r30 +; CHECK-BE-NEXT: ld r0, 16(r1) +; CHECK-BE-NEXT: ld r30, -16(r1) +; CHECK-BE-NEXT: mtlr r0 +; CHECK-BE-NEXT: blr +entry: + %a = alloca <4 x i32>, align 32 + %0 = bitcast <4 x i32>* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 16, i8* %0) + store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* %a, align 32 + call void @test1(<4 x i32>* %a) + %1 = load <4 x i32>, <4 x i32>* %a, align 32 + %vecext = extractelement <4 x i32> %1, i32 0 + %2 = bitcast <4 x i32>* %a to i8* + call void @llvm.lifetime.end.p0i8(i64 16, i8* %2) + ret i32 %vecext +} + + +@Arr1 = dso_local global [64 x i8] zeroinitializer, align 1 + +define dso_local void @test_Array() nounwind { +; CHECK-OPT-LABEL: @test_Array( +; CHECK-OPT-NEXT: entry: +; CHECK-OPT-NEXT: %Arr2 = alloca [64 x i16], align 2 +; CHECK-OPT:
store <16 x i16> [[TMP0:%.*]], <16 x i16>* [[TMP1:%.*]], align 2 + +entry: + %Arr2 = alloca [64 x i16], align 2 + %i = alloca i32, align 4 + %0 = bitcast [64 x i16]* %Arr2 to i8* + call void @llvm.lifetime.start.p0i8(i64 128, i8* %0) + %1 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %2 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %2, 64 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %3 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) + br label %for.end + +for.body: ; preds = %for.cond + %4 = load i32, i32* %i, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @Arr1, i64 0, i64 %idxprom + %5 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %5 to i16 + %6 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %6 to i64 + %arrayidx2 = getelementptr inbounds [64 x i16], [64 x i16]* %Arr2, i64 0, i64 %idxprom1 + store i16 %conv, i16* %arrayidx2, align 2 + br label %for.inc + +for.inc: ; preds = %for.body + %7 = load i32, i32* %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + %arraydecay = getelementptr inbounds [64 x i16], [64 x i16]* %Arr2, i64 0, i64 0 + call void @test_arr(i16* %arraydecay) + %8 = bitcast [64 x i16]* %Arr2 to i8* + call void @llvm.lifetime.end.p0i8(i64 128, i8* %8) + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) nounwind + +declare void @test(<8 x i32>*) nounwind +declare void @test1(<4 x i32>*) nounwind +declare void @test_arr(i16*) + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) nounwind