diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -59,7 +59,7 @@ CCIfType<[f32], CCAssignToReg<[F1]>>, CCIfType<[f64], CCAssignToReg<[F1]>>, - CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2]>>>, + CCIfType<[f128], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2]>>>, CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", @@ -92,7 +92,7 @@ // For P9, f128 are passed in vector registers. CCIfType<[f128], - CCIfSubtarget<"hasP9Vector()", + CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, // Vector types returned as "direct" go into V2 .. V9; note that only the @@ -149,7 +149,7 @@ CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, CCIfType<[f128], - CCIfSubtarget<"hasP9Vector()", + CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", @@ -216,7 +216,7 @@ // Vectors and float128 get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>, - CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToStack<16, 16>>> + CCIfType<[f128], CCIfSubtarget<"hasAltivec()", CCAssignToStack<16, 16>>> ]>; // This calling convention puts vector arguments always on the stack. It is used @@ -238,7 +238,7 @@ // Float128 types treated as vector arguments. 
CCIfType<[f128], - CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, + CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13]>>>, CCDelegateTo diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -121,6 +121,11 @@ static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables", cl::desc("use absolute jump tables on ppc"), cl::Hidden); +// TODO - Remove this option once soft fp128 is fully supported. +static cl::opt<bool> + EnableSoftFP128("enable-soft-fp128", + cl::desc("temp option to enable soft fp128"), cl::Hidden); + STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM"); @@ -1161,6 +1166,32 @@ setOperationAction(ISD::BSWAP, MVT::v4i32, Legal); setOperationAction(ISD::BSWAP, MVT::v2i64, Legal); setOperationAction(ISD::BSWAP, MVT::v1i128, Legal); + } else if (Subtarget.hasAltivec() && EnableSoftFP128) { + addRegisterClass(MVT::f128, &PPC::VRRCRegClass); + + for (MVT FPT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand); + + setOperationAction(ISD::LOAD, MVT::f128, Promote); + setOperationAction(ISD::STORE, MVT::f128, Promote); + + AddPromotedToType(ISD::LOAD, MVT::f128, MVT::v4i32); + AddPromotedToType(ISD::STORE, MVT::f128, MVT::v4i32); + + setOperationAction(ISD::FADD, MVT::f128, Expand); + setOperationAction(ISD::FSUB, MVT::f128, Expand); + setOperationAction(ISD::FMUL, MVT::f128, Expand); + setOperationAction(ISD::FDIV, MVT::f128, Expand); + setOperationAction(ISD::FNEG, MVT::f128, Expand); + setOperationAction(ISD::FABS, MVT::f128, Expand); + setOperationAction(ISD::FSIN, MVT::f128, Expand); + setOperationAction(ISD::FCOS, MVT::f128, Expand); + setOperationAction(ISD::FPOW, MVT::f128, 
Expand); + setOperationAction(ISD::FPOWI, MVT::f128, Expand); + setOperationAction(ISD::FREM, MVT::f128, Expand); + setOperationAction(ISD::FSQRT, MVT::f128, Expand); + setOperationAction(ISD::FMA, MVT::f128, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); } if (Subtarget.hasP9Altivec()) { diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -933,6 +933,18 @@ def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>; def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v16i8 VRRC:$src))), (f128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VRRC:$src))), (f128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VRRC:$src))), (f128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VRRC:$src))), (f128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VRRC:$src))), (f128 VRRC:$src)>; + +def : Pat<(v16i8 (bitconvert (f128 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (f128 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (f128 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (f128 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v2f64 (bitconvert (f128 VRRC:$src))), (v2f64 VRRC:$src)>; + // Max/Min def : Pat<(v16i8 (umax v16i8:$src1, v16i8:$src2)), (v16i8 (VMAXUB $src1, $src2))>; diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll --- a/llvm/test/CodeGen/PowerPC/f128-arith.ll +++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll @@ -2,7 +2,7 @@ ; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \ ; RUN: -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | FileCheck %s ; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \ -; RUN: -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | FileCheck %s \ +; RUN: 
-ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s -enable-soft-fp128 | FileCheck %s \ ; RUN: -check-prefix=CHECK-P8 ; Function Attrs: norecurse nounwind @@ -23,15 +23,12 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 +; CHECK-P8-NEXT: vmr v3, v2 ; CHECK-P8-NEXT: bl __addkf3 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -62,15 +59,12 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 +; CHECK-P8-NEXT: vmr v3, v2 ; CHECK-P8-NEXT: bl __subkf3 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -101,15 +95,12 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 +; CHECK-P8-NEXT: vmr v3, v2 ; CHECK-P8-NEXT: bl __mulkf3 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte 
Folded Reload @@ -140,15 +131,12 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 +; CHECK-P8-NEXT: vmr v3, v2 ; CHECK-P8-NEXT: bl __divkf3 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -173,9 +161,9 @@ ; CHECK-P8-LABEL: testLdNSt: ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: addi r3, r3, 4 -; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: addi r3, r4, 8 -; CHECK-P8-NEXT: stxvd2x vs0, 0, r3 +; CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: blr entry: %add.ptr = getelementptr inbounds i8, i8* %PtrC, i64 4 @@ -205,15 +193,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl sqrtl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -239,13 +223,18 @@ ; ; CHECK-P8-LABEL: qpCpsgn: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: ld r4, 8(r4) -; CHECK-P8-NEXT: ld r6, 0(r3) -; CHECK-P8-NEXT: ld r3, 8(r3) -; CHECK-P8-NEXT: rotldi r4, r4, 1 -; CHECK-P8-NEXT: rldimi r3, r4, 63, 0 -; CHECK-P8-NEXT: std r6, 0(r5) -; CHECK-P8-NEXT: std r3, 8(r5) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 +; CHECK-P8-NEXT: addi r3, r1, -16 
+; CHECK-P8-NEXT: addi r4, r1, -32 +; CHECK-P8-NEXT: stvx v3, 0, r3 +; CHECK-P8-NEXT: stvx v2, 0, r4 +; CHECK-P8-NEXT: lbz r3, -1(r1) +; CHECK-P8-NEXT: lbz r6, -17(r1) +; CHECK-P8-NEXT: rlwimi r6, r3, 0, 0, 24 +; CHECK-P8-NEXT: stb r6, -17(r1) +; CHECK-P8-NEXT: lvx v2, 0, r4 +; CHECK-P8-NEXT: stvx v2, 0, r5 ; CHECK-P8-NEXT: blr fp128* nocapture %res) { entry: @@ -268,11 +257,14 @@ ; ; CHECK-P8-LABEL: qpAbs: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r3, 8(r3) -; CHECK-P8-NEXT: clrldi r3, r3, 1 -; CHECK-P8-NEXT: std r5, 0(r4) -; CHECK-P8-NEXT: std r3, 8(r4) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: addi r3, r1, -16 +; CHECK-P8-NEXT: stvx v2, 0, r3 +; CHECK-P8-NEXT: lbz r5, -1(r1) +; CHECK-P8-NEXT: clrlwi r5, r5, 25 +; CHECK-P8-NEXT: stb r5, -1(r1) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: stvx v2, 0, r4 ; CHECK-P8-NEXT: blr entry: %0 = load fp128, fp128* %a, align 16 @@ -293,12 +285,20 @@ ; ; CHECK-P8-LABEL: qpNAbs: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: ld r6, 0(r3) -; CHECK-P8-NEXT: ld r3, 8(r3) -; CHECK-P8-NEXT: li r5, -1 -; CHECK-P8-NEXT: rldimi r3, r5, 63, 0 -; CHECK-P8-NEXT: std r6, 0(r4) -; CHECK-P8-NEXT: std r3, 8(r4) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: addi r3, r1, -32 +; CHECK-P8-NEXT: stvx v2, 0, r3 +; CHECK-P8-NEXT: lbz r5, -17(r1) +; CHECK-P8-NEXT: clrlwi r5, r5, 25 +; CHECK-P8-NEXT: stb r5, -17(r1) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: addi r3, r1, -16 +; CHECK-P8-NEXT: stvx v2, 0, r3 +; CHECK-P8-NEXT: lbz r5, -1(r1) +; CHECK-P8-NEXT: xori r5, r5, 128 +; CHECK-P8-NEXT: stb r5, -1(r1) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: stvx v2, 0, r4 ; CHECK-P8-NEXT: blr entry: %0 = load fp128, fp128* %a, align 16 @@ -319,13 +319,14 @@ ; ; CHECK-P8-LABEL: qpNeg: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: li r5, 1 -; CHECK-P8-NEXT: ld r6, 0(r3) -; CHECK-P8-NEXT: ld r3, 8(r3) -; CHECK-P8-NEXT: sldi r5, r5, 63 -; CHECK-P8-NEXT: xor r3, r3, r5 -; CHECK-P8-NEXT: std 
r6, 0(r4) -; CHECK-P8-NEXT: std r3, 8(r4) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: addi r3, r1, -16 +; CHECK-P8-NEXT: stvx v2, 0, r3 +; CHECK-P8-NEXT: lbz r5, -1(r1) +; CHECK-P8-NEXT: xori r5, r5, 128 +; CHECK-P8-NEXT: stb r5, -1(r1) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: stvx v2, 0, r4 ; CHECK-P8-NEXT: blr entry: %0 = load fp128, fp128* %a, align 16 @@ -358,9 +359,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl sinf128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -397,9 +396,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl cosf128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -436,9 +433,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl logf128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -475,9 +470,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl log10f128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -514,9 +507,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl log2f128 ; 
CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -554,12 +545,8 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r7, 0(r3) -; CHECK-P8-NEXT: ld r8, 8(r3) -; CHECK-P8-NEXT: ld r5, 0(r4) -; CHECK-P8-NEXT: ld r6, 8(r4) -; CHECK-P8-NEXT: mr r3, r7 -; CHECK-P8-NEXT: mr r4, r8 +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 ; CHECK-P8-NEXT: bl fminf128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -599,12 +586,8 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r7, 0(r3) -; CHECK-P8-NEXT: ld r8, 8(r3) -; CHECK-P8-NEXT: ld r5, 0(r4) -; CHECK-P8-NEXT: ld r6, 8(r4) -; CHECK-P8-NEXT: mr r3, r7 -; CHECK-P8-NEXT: mr r4, r8 +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 ; CHECK-P8-NEXT: bl fmaxf128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -644,12 +627,8 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r7, 0(r3) -; CHECK-P8-NEXT: ld r8, 8(r3) -; CHECK-P8-NEXT: ld r5, 0(r4) -; CHECK-P8-NEXT: ld r6, 8(r4) -; CHECK-P8-NEXT: mr r3, r7 -; CHECK-P8-NEXT: mr r4, r8 +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 ; CHECK-P8-NEXT: bl powf128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -688,9 +667,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl expf128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -727,9 +704,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: 
lvx v2, 0, r3 ; CHECK-P8-NEXT: bl exp2f128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -774,17 +749,13 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r6, 0(r3) -; CHECK-P8-NEXT: ld r7, 8(r3) -; CHECK-P8-NEXT: lwz r8, 0(r4) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lwz r3, 0(r4) ; CHECK-P8-NEXT: mr r30, r5 -; CHECK-P8-NEXT: mr r3, r6 -; CHECK-P8-NEXT: mr r4, r7 -; CHECK-P8-NEXT: mr r5, r8 +; CHECK-P8-NEXT: mr r5, r3 ; CHECK-P8-NEXT: bl __powikf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -831,14 +802,12 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: addis r3, r2, a@toc@ha ; CHECK-P8-NEXT: addis r4, r2, b@toc@ha -; CHECK-P8-NEXT: addis r5, r2, a@toc@ha -; CHECK-P8-NEXT: addi r6, r5, a@toc@l -; CHECK-P8-NEXT: addi r7, r4, b@toc@l -; CHECK-P8-NEXT: ld r3, a@toc@l(r5) -; CHECK-P8-NEXT: ld r5, b@toc@l(r4) -; CHECK-P8-NEXT: ld r4, 8(r6) -; CHECK-P8-NEXT: ld r6, 8(r7) +; CHECK-P8-NEXT: addi r3, r3, a@toc@l +; CHECK-P8-NEXT: addi r4, r4, b@toc@l +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 ; CHECK-P8-NEXT: bl fmodf128 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -869,15 +838,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl ceill ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, 
r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -908,15 +873,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl floorl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -947,15 +908,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl truncl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -986,15 +943,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl roundl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1038,11 +991,8 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, 
-48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl lroundl ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: stw r3, 0(r30) @@ -1089,11 +1039,8 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl llroundl ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: std r3, 0(r30) @@ -1127,15 +1074,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl rintl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1179,11 +1122,8 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl lrintl ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: stw r3, 0(r30) @@ -1230,11 +1170,8 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; 
CHECK-P8-NEXT: bl llrintl ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: std r3, 0(r30) @@ -1268,15 +1205,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl nearbyintl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1309,21 +1242,13 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r11, 0(r3) -; CHECK-P8-NEXT: ld r12, 8(r3) -; CHECK-P8-NEXT: ld r9, 0(r4) -; CHECK-P8-NEXT: ld r10, 8(r4) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 +; CHECK-P8-NEXT: lvx v4, 0, r5 ; CHECK-P8-NEXT: mr r30, r6 -; CHECK-P8-NEXT: ld r7, 0(r5) -; CHECK-P8-NEXT: ld r8, 8(r5) -; CHECK-P8-NEXT: mr r3, r11 -; CHECK-P8-NEXT: mr r4, r12 -; CHECK-P8-NEXT: mr r5, r9 -; CHECK-P8-NEXT: mr r6, r10 ; CHECK-P8-NEXT: bl fmal ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload