Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2451,7 +2451,8 @@ // We care about the legality of the operation after it has been type // legalized. - while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal) + while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal && + VT != TLI.getTypeToTransformTo(Ctx, VT)) VT = TLI.getTypeToTransformTo(Ctx, VT); // If the vselect is legal, assume we want to leave this as a vector setcc + Index: llvm/trunk/lib/Target/X86/X86CallingConv.td =================================================================== --- llvm/trunk/lib/Target/X86/X86CallingConv.td +++ llvm/trunk/lib/Target/X86/X86CallingConv.td @@ -158,6 +158,7 @@ // The X86-64 calling convention always returns FP values in XMM0. CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, // MMX vector types are always returned in XMM0. CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, @@ -293,7 +294,7 @@ CCIfType<[v64i1], CCPromoteToType>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -318,7 +319,7 @@ // Long doubles get stack slots whose size and alignment depends on the // subtarget. - CCIfType<[f80], CCAssignToStack<0, 0>>, + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. 
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -296,6 +296,7 @@ setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::f128, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); @@ -303,6 +304,7 @@ setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); @@ -415,12 +417,14 @@ setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); + setOperationAction(ISD::SELECT , MVT::f128 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); + setOperationAction(ISD::SETCC , MVT::f128 , Custom); setOperationAction(ISD::SETCCE , MVT::i8 , Custom); setOperationAction(ISD::SETCCE , MVT::i16 , Custom); setOperationAction(ISD::SETCCE , MVT::i32 , Custom); @@ -619,8 +623,16 @@ setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); - // Long double always uses X87. 
+ // Long double always uses X87, except f128 in MMX. if (!Subtarget->useSoftFloat()) { + if (Subtarget->is64Bit() && Subtarget->hasMMX()) { + addRegisterClass(MVT::f128, &X86::FR128RegClass); + ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); + setOperationAction(ISD::FABS , MVT::f128, Custom); + setOperationAction(ISD::FNEG , MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + } + addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -2363,7 +2375,7 @@ EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values - if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -2647,6 +2659,8 @@ RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) @@ -13410,6 +13424,8 @@ SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. @@ -13422,6 +13438,11 @@ LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else if (IsF128) { + // SSE instructions are used for optimized f128 logical operations. + LogicVT = MVT::f128; + EltVT = VT; + NumElts = 1; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. @@ -13453,7 +13474,7 @@ IsFABS ? X86ISD::FAND : IsFNABS ? 
X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; - if (VT.isVector()) + if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, @@ -13472,6 +13493,7 @@ SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT SrcVT = Op1.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -13486,13 +13508,16 @@ // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. + assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = - VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; + VT == MVT::f64 ? APFloat::IEEEdouble : + (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); const unsigned SizeInBits = VT.getSizeInBits(); SmallVector CV( - VT == MVT::f64 ? 2 : 4, + VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); // First, clear all bits but the sign bit from the second operand (sign). @@ -13505,12 +13530,13 @@ // Perform all logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. This allows load folding of the // constants into the logic instructions. - MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 16); - Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + if (!IsF128) + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). 
@@ -13519,8 +13545,9 @@ APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? SignBit : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -13536,13 +13563,15 @@ false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa(Op0)) { - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + if (!IsF128) + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); } // OR the magnitude value with the sign bit. Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? Val : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -22158,6 +22187,7 @@ return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: case X86::CMOV_FR64: + case X86::CMOV_FR128: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: @@ -23821,7 +23851,8 @@ // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && + VT != MVT::f80 && VT != MVT::f128 && + (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); @@ -27946,6 +27977,7 @@ case MVT::f64: case MVT::i64: return std::make_pair(0U, &X86::FR64RegClass); + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. 
// Vector types. case MVT::v16i8: case MVT::v8i16: @@ -28058,6 +28090,7 @@ // target independent register mapper will just pick the first match it can // find, ignoring the required type. + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Index: llvm/trunk/lib/Target/X86/X86InstrCompiler.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrCompiler.td +++ llvm/trunk/lib/Target/X86/X86InstrCompiler.td @@ -512,6 +512,7 @@ defm _FR32 : CMOVrr_PSEUDO; defm _FR64 : CMOVrr_PSEUDO; + defm _FR128 : CMOVrr_PSEUDO; defm _V4F32 : CMOVrr_PSEUDO; defm _V2F64 : CMOVrr_PSEUDO; defm _V2I64 : CMOVrr_PSEUDO; Index: llvm/trunk/lib/Target/X86/X86InstrInfo.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.td +++ llvm/trunk/lib/Target/X86/X86InstrInfo.td @@ -955,11 +955,12 @@ return false; }]>; -def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; -def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; -def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; -def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; -def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td 
=================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -413,6 +413,8 @@ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>; + def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>; } // Bitcasts between 256-bit vector types. Return the original type since @@ -8851,3 +8853,59 @@ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; } } + +//===----------------------------------------------------------------------===// +// Extra selection patterns for FR128, f128, f128mem + +// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. +def : Pat<(store (f128 FR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>; + +def : Pat<(loadf128 addr:$src), + (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>; + +// andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 +def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fand FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(and FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86for FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(or FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fxor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(xor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; Index: llvm/trunk/lib/Target/X86/X86RegisterInfo.td =================================================================== --- llvm/trunk/lib/Target/X86/X86RegisterInfo.td +++ llvm/trunk/lib/Target/X86/X86RegisterInfo.td @@ -423,6 +423,8 @@ def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; +def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>; + // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. 
This will cause us to spill Index: llvm/trunk/test/CodeGen/X86/fp128-calling-conv.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fp128-calling-conv.ll +++ llvm/trunk/test/CodeGen/X86/fp128-calling-conv.ll @@ -0,0 +1,47 @@ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s + +; __float128 myFP128 = 1.0L; // x86_64-linux-android +@myFP128 = global fp128 0xL00000000000000003FFF000000000000, align 16 + +; The first few parameters are passed in registers and the other are on stack. + +define fp128 @TestParam_FP128_0(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, fp128 %d18, fp128 %d19) { +entry: + ret fp128 %d0 +; CHECK-LABEL: TestParam_FP128_0: +; CHECK-NOT: mov +; CHECK: retq +} + +define fp128 @TestParam_FP128_1(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, fp128 %d18, fp128 %d19) { +entry: + ret fp128 %d1 +; CHECK-LABEL: TestParam_FP128_1: +; CHECK: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +} + +define fp128 @TestParam_FP128_7(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, fp128 %d18, fp128 %d19) { +entry: + ret fp128 %d7 +; CHECK-LABEL: TestParam_FP128_7: +; CHECK: movaps %xmm7, %xmm0 +; CHECK-NEXT: retq +} + +define fp128 @TestParam_FP128_8(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, 
fp128 %d18, fp128 %d19) { +entry: + ret fp128 %d8 +; CHECK-LABEL: TestParam_FP128_8: +; CHECK: movaps 8(%rsp), %xmm0 +; CHECK-NEXT: retq +} + +define fp128 @TestParam_FP128_9(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, fp128 %d18, fp128 %d19) { +entry: + ret fp128 %d9 +; CHECK-LABEL: TestParam_FP128_9: +; CHECK: movaps 24(%rsp), %xmm0 +; CHECK-NEXT: retq +} Index: llvm/trunk/test/CodeGen/X86/fp128-cast.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fp128-cast.ll +++ llvm/trunk/test/CodeGen/X86/fp128-cast.ll @@ -0,0 +1,279 @@ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s + +; Check soft floating point conversion function calls. + +@vi32 = common global i32 0, align 4 +@vi64 = common global i64 0, align 8 +@vu32 = common global i32 0, align 4 +@vu64 = common global i64 0, align 8 +@vf32 = common global float 0.000000e+00, align 4 +@vf64 = common global double 0.000000e+00, align 8 +@vf128 = common global fp128 0xL00000000000000000000000000000000, align 16 + +define void @TestFPExtF32_F128() { +entry: + %0 = load float, float* @vf32, align 4 + %conv = fpext float %0 to fp128 + store fp128 %conv, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: TestFPExtF32_F128: +; CHECK: movss vf32(%rip), %xmm0 +; CHECK-NEXT: callq __extendsftf2 +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @TestFPExtF64_F128() { +entry: + %0 = load double, double* @vf64, align 8 + %conv = fpext double %0 to fp128 + store fp128 %conv, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: TestFPExtF64_F128: +; CHECK: movsd vf64(%rip), %xmm0 +; CHECK-NEXT: callq __extenddftf2 +; CHECK-NEXT: movapd %xmm0, vf128(%rip) +; CHECK: ret +} + +define void 
@TestFPToSIF128_I32() { +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = fptosi fp128 %0 to i32 + store i32 %conv, i32* @vi32, align 4 + ret void +; CHECK-LABEL: TestFPToSIF128_I32: +; CHECK: movaps vf128(%rip), %xmm0 +; CHECK-NEXT: callq __fixtfsi +; CHECK-NEXT: movl %eax, vi32(%rip) +; CHECK: retq +} + +define void @TestFPToUIF128_U32() { +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = fptoui fp128 %0 to i32 + store i32 %conv, i32* @vu32, align 4 + ret void +; CHECK-LABEL: TestFPToUIF128_U32: +; CHECK: movaps vf128(%rip), %xmm0 +; CHECK-NEXT: callq __fixunstfsi +; CHECK-NEXT: movl %eax, vu32(%rip) +; CHECK: retq +} + +define void @TestFPToSIF128_I64() { +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = fptosi fp128 %0 to i32 + %conv1 = sext i32 %conv to i64 + store i64 %conv1, i64* @vi64, align 8 + ret void +; CHECK-LABEL: TestFPToSIF128_I64: +; CHECK: movaps vf128(%rip), %xmm0 +; CHECK-NEXT: callq __fixtfsi +; CHECK-NEXT: cltq +; CHECK-NEXT: movq %rax, vi64(%rip) +; CHECK: retq +} + +define void @TestFPToUIF128_U64() { +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = fptoui fp128 %0 to i32 + %conv1 = zext i32 %conv to i64 + store i64 %conv1, i64* @vu64, align 8 + ret void +; CHECK-LABEL: TestFPToUIF128_U64: +; CHECK: movaps vf128(%rip), %xmm0 +; CHECK-NEXT: callq __fixunstfsi +; CHECK-NEXT: movl %eax, %eax +; CHECK-NEXT: movq %rax, vu64(%rip) +; CHECK: retq +} + +define void @TestFPTruncF128_F32() { +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = fptrunc fp128 %0 to float + store float %conv, float* @vf32, align 4 + ret void +; CHECK-LABEL: TestFPTruncF128_F32: +; CHECK: movaps vf128(%rip), %xmm0 +; CHECK-NEXT: callq __trunctfsf2 +; CHECK-NEXT: movss %xmm0, vf32(%rip) +; CHECK: retq +} + +define void @TestFPTruncF128_F64() { +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = fptrunc fp128 %0 to double + store double %conv, double* @vf64, align 8 + ret void +; CHECK-LABEL: TestFPTruncF128_F64: 
+; CHECK: movapd vf128(%rip), %xmm0 +; CHECK-NEXT: callq __trunctfdf2 +; CHECK-NEXT: movsd %xmm0, vf64(%rip) +; CHECK: retq +} + +define void @TestSIToFPI32_F128() { +entry: + %0 = load i32, i32* @vi32, align 4 + %conv = sitofp i32 %0 to fp128 + store fp128 %conv, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: TestSIToFPI32_F128: +; CHECK: movl vi32(%rip), %edi +; CHECK-NEXT: callq __floatsitf +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @TestUIToFPU32_F128() #2 { +entry: + %0 = load i32, i32* @vu32, align 4 + %conv = uitofp i32 %0 to fp128 + store fp128 %conv, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: TestUIToFPU32_F128: +; CHECK: movl vu32(%rip), %edi +; CHECK-NEXT: callq __floatunsitf +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @TestSIToFPI64_F128(){ +entry: + %0 = load i64, i64* @vi64, align 8 + %conv = sitofp i64 %0 to fp128 + store fp128 %conv, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: TestSIToFPI64_F128: +; CHECK: movq vi64(%rip), %rdi +; CHECK-NEXT: callq __floatditf +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @TestUIToFPU64_F128() #2 { +entry: + %0 = load i64, i64* @vu64, align 8 + %conv = uitofp i64 %0 to fp128 + store fp128 %conv, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: TestUIToFPU64_F128: +; CHECK: movq vu64(%rip), %rdi +; CHECK-NEXT: callq __floatunditf +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define i32 @TestConst128(fp128 %v) { +entry: + %cmp = fcmp ogt fp128 %v, 0xL00000000000000003FFF000000000000 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK-LABEL: TestConst128: +; CHECK: movaps {{.*}}, %xmm1 +; CHECK-NEXT: callq __gttf2 +; CHECK-NEXT: test +; CHECK: retq +} + +; C code: +; struct TestBits_ieee_ext { +; unsigned v1; +; unsigned v2; +; }; +; union TestBits_LDU { +; FP128 ld; +; struct TestBits_ieee_ext bits; +; }; +; int TestBits128(FP128 ld) { +; union TestBits_LDU u; +; u.ld = ld * ld; +; return 
((u.bits.v1 | u.bits.v2) == 0); +; } +define i32 @TestBits128(fp128 %ld) { +entry: + %mul = fmul fp128 %ld, %ld + %0 = bitcast fp128 %mul to i128 + %shift = lshr i128 %0, 32 + %or5 = or i128 %shift, %0 + %or = trunc i128 %or5 to i32 + %cmp = icmp eq i32 %or, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK-LABEL: TestBits128: +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: callq __multf3 +; CHECK-NEXT: movaps %xmm0, (%rsp) +; CHECK-NEXT: movq (%rsp), +; CHECK-NEXT: movq % +; CHECK-NEXT: shrq $32, +; CHECK: orl +; CHECK-NEXT: sete %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK: retq +; +; If TestBits128 fails due to any llvm or clang change, +; please make sure the original simplified C code will +; be compiled into correct IL and assembly code, not +; just this TestBits128 test case. Better yet, try to +; test the whole libm and its test cases. +} + +; C code: (compiled with -target x86_64-linux-android) +; typedef long double __float128; +; __float128 TestPair128(unsigned long a, unsigned long b) { +; unsigned __int128 n; +; unsigned __int128 v1 = ((unsigned __int128)a << 64); +; unsigned __int128 v2 = (unsigned __int128)b; +; n = (v1 | v2) + 3; +; return *(__float128*)&n; +; } +define fp128 @TestPair128(i64 %a, i64 %b) { +entry: + %conv = zext i64 %a to i128 + %shl = shl nuw i128 %conv, 64 + %conv1 = zext i64 %b to i128 + %or = or i128 %shl, %conv1 + %add = add i128 %or, 3 + %0 = bitcast i128 %add to fp128 + ret fp128 %0 +; CHECK-LABEL: TestPair128: +; CHECK: addq $3, %rsi +; CHECK: movq %rsi, -24(%rsp) +; CHECK: movq %rdi, -16(%rsp) +; CHECK: movaps -24(%rsp), %xmm0 +; CHECK-NEXT: retq +} + +define fp128 @TestTruncCopysign(fp128 %x, i32 %n) { +entry: + %cmp = icmp sgt i32 %n, 50000 + br i1 %cmp, label %if.then, label %cleanup + +if.then: ; preds = %entry + %conv = fptrunc fp128 %x to double + %call = tail call double @copysign(double 0x7FF0000000000000, double %conv) #2 + %conv1 = fpext double %call to fp128 + br label %cleanup + +cleanup: ; preds = %entry, 
%if.then + %retval.0 = phi fp128 [ %conv1, %if.then ], [ %x, %entry ] + ret fp128 %retval.0 +; CHECK-LABEL: TestTruncCopysign: +; CHECK: callq __trunctfdf2 +; CHECK-NEXT: andpd {{.*}}, %xmm0 +; CHECK-NEXT: orpd {{.*}}, %xmm0 +; CHECK-NEXT: callq __extenddftf2 +; CHECK: retq +} + +declare double @copysign(double, double) #1 + +attributes #2 = { nounwind readnone } Index: llvm/trunk/test/CodeGen/X86/fp128-compare.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fp128-compare.ll +++ llvm/trunk/test/CodeGen/X86/fp128-compare.ll @@ -0,0 +1,96 @@ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s + +define i32 @TestComp128GT(fp128 %d1, fp128 %d2) { +entry: + %cmp = fcmp ogt fp128 %d1, %d2 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK-LABEL: TestComp128GT: +; CHECK: callq __gttf2 +; CHECK: setg %al +; CHECK: movzbl %al, %eax +; CHECK: retq +} + +define i32 @TestComp128GE(fp128 %d1, fp128 %d2) { +entry: + %cmp = fcmp oge fp128 %d1, %d2 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK-LABEL: TestComp128GE: +; CHECK: callq __getf2 +; CHECK: testl %eax, %eax +; CHECK: setns %al +; CHECK: movzbl %al, %eax +; CHECK: retq +} + +define i32 @TestComp128LT(fp128 %d1, fp128 %d2) { +entry: + %cmp = fcmp olt fp128 %d1, %d2 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK-LABEL: TestComp128LT: +; CHECK: callq __lttf2 +; CHECK-NEXT: shrl $31, %eax +; CHECK: retq +; +; The 'shrl' is a special optimization in llvm to combine +; the effect of 'fcmp olt' and 'zext'. The main purpose is +; to test soften call to __lttf2. 
+} + +define i32 @TestComp128LE(fp128 %d1, fp128 %d2) { +entry: + %cmp = fcmp ole fp128 %d1, %d2 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK-LABEL: TestComp128LE: +; CHECK: callq __letf2 +; CHECK-NEXT: testl %eax, %eax +; CHECK: setle %al +; CHECK: movzbl %al, %eax +; CHECK: retq +} + +define i32 @TestComp128EQ(fp128 %d1, fp128 %d2) { +entry: + %cmp = fcmp oeq fp128 %d1, %d2 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK-LABEL: TestComp128EQ: +; CHECK: callq __eqtf2 +; CHECK-NEXT: testl %eax, %eax +; CHECK: sete %al +; CHECK: movzbl %al, %eax +; CHECK: retq +} + +define i32 @TestComp128NE(fp128 %d1, fp128 %d2) { +entry: + %cmp = fcmp une fp128 %d1, %d2 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK-LABEL: TestComp128NE: +; CHECK: callq __netf2 +; CHECK-NEXT: testl %eax, %eax +; CHECK: setne %al +; CHECK: movzbl %al, %eax +; CHECK: retq +} + +define fp128 @TestMax(fp128 %x, fp128 %y) { +entry: + %cmp = fcmp ogt fp128 %x, %y + %cond = select i1 %cmp, fp128 %x, fp128 %y + ret fp128 %cond +; CHECK-LABEL: TestMax: +; CHECK: movaps %xmm1 +; CHECK: movaps %xmm0 +; CHECK: callq __gttf2 +; CHECK: movaps {{.*}}, %xmm0 +; CHECK: testl %eax, %eax +; CHECK: movaps {{.*}}, %xmm0 +; CHECK: retq +} Index: llvm/trunk/test/CodeGen/X86/fp128-i128.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fp128-i128.ll +++ llvm/trunk/test/CodeGen/X86/fp128-i128.ll @@ -0,0 +1,320 @@ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s + +; These tests were generated from simplified libm C code. +; When compiled for the x86_64-linux-android target, +; long double is mapped to f128 type that should be passed +; in SSE registers. When the f128 type calling convention +; problem was fixed, old llvm code failed to handle f128 values +; in several f128/i128 type operations. 
These unit tests hopefully +; will catch regressions in any future change in this area. +; To modify or enhance these test cases, please consult the libm +; code pattern and compile with -target x86_64-linux-android +; to generate IL. If the __float128 keyword is not accepted by +; clang, just define it as "long double". +; + +; typedef long double __float128; +; union IEEEl2bits { +; __float128 e; +; struct { +; unsigned long manl :64; +; unsigned long manh :48; +; unsigned int exp :15; +; unsigned int sign :1; +; } bits; +; struct { +; unsigned long manl :64; +; unsigned long manh :48; +; unsigned int expsign :16; +; } xbits; +; }; + +; C code: +; void foo(__float128 x); +; void TestUnionLD1(__float128 s, unsigned long n) { +; union IEEEl2bits u; +; __float128 w; +; u.e = s; +; u.bits.manh = n; +; w = u.e; +; foo(w); +; } +define void @TestUnionLD1(fp128 %s, i64 %n) #0 { +entry: + %0 = bitcast fp128 %s to i128 + %1 = zext i64 %n to i128 + %bf.value = shl nuw i128 %1, 64 + %bf.shl = and i128 %bf.value, 5192296858534809181786422619668480 + %bf.clear = and i128 %0, -5192296858534809181786422619668481 + %bf.set = or i128 %bf.shl, %bf.clear + %2 = bitcast i128 %bf.set to fp128 + tail call void @foo(fp128 %2) #2 + ret void +; CHECK-LABEL: TestUnionLD1: +; CHECK: movaps %xmm0, -24(%rsp) +; CHECK-NEXT: movq -24(%rsp), %rax +; CHECK-NEXT: movabsq $281474976710655, %rcx +; CHECK-NEXT: andq %rdi, %rcx +; CHECK-NEXT: movabsq $-281474976710656, %rdx +; CHECK-NEXT: andq -16(%rsp), %rdx +; CHECK-NEXT: movq %rax, -40(%rsp) +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movq %rdx, -32(%rsp) +; CHECK-NEXT: movaps -40(%rsp), %xmm0 +; CHECK-NEXT: jmp foo +} + +; C code: +; __float128 TestUnionLD2(__float128 s) { +; union IEEEl2bits u; +; __float128 w; +; u.e = s; +; u.bits.manl = 0; +; w = u.e; +; return w; +; } +define fp128 @TestUnionLD2(fp128 %s) #0 { +entry: + %0 = bitcast fp128 %s to i128 + %bf.clear = and i128 %0, -18446744073709551616 + %1 = bitcast i128 %bf.clear to fp128 + ret 
fp128 %1 +; CHECK-LABEL: TestUnionLD2: +; CHECK: movaps %xmm0, -24(%rsp) +; CHECK-NEXT: movq -16(%rsp), %rax +; CHECK-NEXT: movq %rax, -32(%rsp) +; CHECK-NEXT: movq $0, -40(%rsp) +; CHECK-NEXT: movaps -40(%rsp), %xmm0 +; CHECK-NEXT: retq +} + +; C code: +; __float128 TestI128_1(__float128 x) +; { +; union IEEEl2bits z; +; z.e = x; +; z.bits.sign = 0; +; return (z.e < 0.1L) ? 1.0L : 2.0L; +; } +define fp128 @TestI128_1(fp128 %x) #0 { +entry: + %0 = bitcast fp128 %x to i128 + %bf.clear = and i128 %0, 170141183460469231731687303715884105727 + %1 = bitcast i128 %bf.clear to fp128 + %cmp = fcmp olt fp128 %1, 0xL999999999999999A3FFB999999999999 + %cond = select i1 %cmp, fp128 0xL00000000000000003FFF000000000000, fp128 0xL00000000000000004000000000000000 + ret fp128 %cond +; CHECK-LABEL: TestI128_1: +; CHECK: movaps %xmm0, +; CHECK: movabsq $9223372036854775807, +; CHECK: callq __lttf2 +; CHECK: testl %eax, %eax +; CHECK: movaps {{.*}}, %xmm0 +; CHECK: retq +} + +; C code: +; __float128 TestI128_2(__float128 x, __float128 y) +; { +; unsigned short hx; +; union IEEEl2bits ge_u; +; ge_u.e = x; +; hx = ge_u.xbits.expsign; +; return (hx & 0x8000) == 0 ? 
x : y; +; } +define fp128 @TestI128_2(fp128 %x, fp128 %y) #0 { +entry: + %0 = bitcast fp128 %x to i128 + %cmp = icmp sgt i128 %0, -1 + %cond = select i1 %cmp, fp128 %x, fp128 %y + ret fp128 %cond +; CHECK-LABEL: TestI128_2: +; CHECK: movaps %xmm0, -24(%rsp) +; CHECK-NEXT: cmpq $0, -16(%rsp) +; CHECK-NEXT: jns +; CHECK: movaps %xmm1, %xmm0 +; CHECK: retq +} + +; C code: +; __float128 TestI128_3(__float128 x, int *ex) +; { +; union IEEEl2bits u; +; u.e = x; +; if (u.bits.exp == 0) { +; u.e *= 0x1.0p514; +; u.bits.exp = 0x3ffe; +; } +; return (u.e); +; } +define fp128 @TestI128_3(fp128 %x, i32* nocapture readnone %ex) #0 { +entry: + %0 = bitcast fp128 %x to i128 + %bf.cast = and i128 %0, 170135991163610696904058773219554885632 + %cmp = icmp eq i128 %bf.cast, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %mul = fmul fp128 %x, 0xL00000000000000004201000000000000 + %1 = bitcast fp128 %mul to i128 + %bf.clear4 = and i128 %1, -170135991163610696904058773219554885633 + %bf.set = or i128 %bf.clear4, 85060207136517546210586590865283612672 + br label %if.end + +if.end: ; preds = %if.then, %entry + %u.sroa.0.0 = phi i128 [ %bf.set, %if.then ], [ %0, %entry ] + %2 = bitcast i128 %u.sroa.0.0 to fp128 + ret fp128 %2 +; CHECK-LABEL: TestI128_3: +; CHECK: movaps %xmm0, +; CHECK: movabsq $9223090561878065152, +; CHECK: testq +; CHECK: callq __multf3 +; CHECK-NEXT: movaps %xmm0 +; CHECK: movabsq $-9223090561878065153, +; CHECK: movabsq $4611123068473966592, +; CHECK: retq +} + +; C code: +; __float128 TestI128_4(__float128 x) +; { +; union IEEEl2bits u; +; __float128 df; +; u.e = x; +; u.xbits.manl = 0; +; df = u.e; +; return x + df; +; } +define fp128 @TestI128_4(fp128 %x) #0 { +entry: + %0 = bitcast fp128 %x to i128 + %bf.clear = and i128 %0, -18446744073709551616 + %1 = bitcast i128 %bf.clear to fp128 + %add = fadd fp128 %1, %x + ret fp128 %add +; CHECK-LABEL: TestI128_4: +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, 16(%rsp) +; 
CHECK-NEXT: movq 24(%rsp), %rax +; CHECK-NEXT: movq %rax, 8(%rsp) +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: movaps (%rsp), %xmm0 +; CHECK-NEXT: callq __addtf3 +; CHECK: retq +} + +@v128 = common global i128 0, align 16 +@v128_2 = common global i128 0, align 16 + +; C code: +; unsigned __int128 v128, v128_2; +; void TestShift128_2() { +; v128 = ((v128 << 96) | v128_2); +; } +define void @TestShift128_2() #2 { +entry: + %0 = load i128, i128* @v128, align 16 + %shl = shl i128 %0, 96 + %1 = load i128, i128* @v128_2, align 16 + %or = or i128 %shl, %1 + store i128 %or, i128* @v128, align 16 + ret void +; CHECK-LABEL: TestShift128_2: +; CHECK: movq v128(%rip), %rax +; CHECK-NEXT: shlq $32, %rax +; CHECK-NEXT: movq v128_2(%rip), %rcx +; CHECK-NEXT: orq v128_2+8(%rip), %rax +; CHECK-NEXT: movq %rcx, v128(%rip) +; CHECK-NEXT: movq %rax, v128+8(%rip) +; CHECK-NEXT: retq +} + +define fp128 @acosl(fp128 %x) #0 { +entry: + %0 = bitcast fp128 %x to i128 + %bf.clear = and i128 %0, -18446744073709551616 + %1 = bitcast i128 %bf.clear to fp128 + %add = fadd fp128 %1, %x + ret fp128 %add +; CHECK-LABEL: acosl: +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, 16(%rsp) +; CHECK-NEXT: movq 24(%rsp), %rax +; CHECK-NEXT: movq %rax, 8(%rsp) +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: movaps (%rsp), %xmm0 +; CHECK-NEXT: callq __addtf3 +; CHECK: retq +} + +; Compare i128 values and check i128 constants. +define fp128 @TestComp(fp128 %x, fp128 %y) #0 { +entry: + %0 = bitcast fp128 %x to i128 + %cmp = icmp sgt i128 %0, -1 + %cond = select i1 %cmp, fp128 %x, fp128 %y + ret fp128 %cond +; CHECK-LABEL: TestComp: +; CHECK: movaps %xmm0, -24(%rsp) +; CHECK-NEXT: cmpq $0, -16(%rsp) +; CHECK-NEXT: jns +; CHECK: movaps %xmm1, %xmm0 +; CHECK: retq +} + +declare void @foo(fp128) #1 + +; Test logical operations on fp128 values. 
+define fp128 @TestFABS_LD(fp128 %x) #0 { +entry: + %call = tail call fp128 @fabsl(fp128 %x) #2 + ret fp128 %call +; CHECK-LABEL: TestFABS_LD +; CHECK: andps {{.*}}, %xmm0 +; CHECK-NEXT: retq +} + +declare fp128 @fabsl(fp128) #1 + +declare fp128 @copysignl(fp128, fp128) #1 + +; Test more complicated logical operations generated from copysignl. +define void @TestCopySign({ fp128, fp128 }* noalias nocapture sret %agg.result, { fp128, fp128 }* byval nocapture readonly align 16 %z) #0 { +entry: + %z.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %z, i64 0, i32 0 + %z.real = load fp128, fp128* %z.realp, align 16 + %z.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %z, i64 0, i32 1 + %z.imag4 = load fp128, fp128* %z.imagp, align 16 + %cmp = fcmp ogt fp128 %z.real, %z.imag4 + %sub = fsub fp128 %z.imag4, %z.imag4 + br i1 %cmp, label %if.then, label %cleanup + +if.then: ; preds = %entry + %call = tail call fp128 @fabsl(fp128 %sub) #2 + br label %cleanup + +cleanup: ; preds = %entry, %if.then + %z.real.sink = phi fp128 [ %z.real, %if.then ], [ %sub, %entry ] + %call.sink = phi fp128 [ %call, %if.then ], [ %z.real, %entry ] + %call5 = tail call fp128 @copysignl(fp128 %z.real.sink, fp128 %z.imag4) #2 + %0 = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %agg.result, i64 0, i32 0 + %1 = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %agg.result, i64 0, i32 1 + store fp128 %call.sink, fp128* %0, align 16 + store fp128 %call5, fp128* %1, align 16 + ret void +; CHECK-LABEL: TestCopySign +; CHECK-NOT: call +; CHECK: callq __subtf3 +; CHECK-NOT: call +; CHECK: callq __gttf2 +; CHECK-NOT: call +; CHECK: andps {{.*}}, %xmm0 +; CHECK: retq +} + + +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } Index: llvm/trunk/test/CodeGen/X86/fp128-libcalls.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fp128-libcalls.ll +++ llvm/trunk/test/CodeGen/X86/fp128-libcalls.ll @@ -0,0 +1,107 @@ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s + +; Check all soft floating point library function calls. + +@vf64 = common global double 0.000000e+00, align 8 +@vf128 = common global fp128 0xL00000000000000000000000000000000, align 16 + +define void @Test128Add(fp128 %d1, fp128 %d2) { +entry: + %add = fadd fp128 %d1, %d2 + store fp128 %add, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: Test128Add: +; CHECK: callq __addtf3 +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @Test128_1Add(fp128 %d1){ +entry: + %0 = load fp128, fp128* @vf128, align 16 + %add = fadd fp128 %0, %d1 + store fp128 %add, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: Test128_1Add: +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: movaps vf128(%rip), %xmm0 +; CHECK-NEXT: callq __addtf3 +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @Test128Sub(fp128 %d1, fp128 %d2){ +entry: + %sub = fsub fp128 %d1, %d2 + store fp128 %sub, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: Test128Sub: +; CHECK: callq __subtf3 +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @Test128_1Sub(fp128 %d1){ +entry: 
+ %0 = load fp128, fp128* @vf128, align 16 + %sub = fsub fp128 %0, %d1 + store fp128 %sub, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: Test128_1Sub: +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: movaps vf128(%rip), %xmm0 +; CHECK-NEXT: callq __subtf3 +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @Test128Mul(fp128 %d1, fp128 %d2){ +entry: + %mul = fmul fp128 %d1, %d2 + store fp128 %mul, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: Test128Mul: +; CHECK: callq __multf3 +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @Test128_1Mul(fp128 %d1){ +entry: + %0 = load fp128, fp128* @vf128, align 16 + %mul = fmul fp128 %0, %d1 + store fp128 %mul, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: Test128_1Mul: +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: movaps vf128(%rip), %xmm0 +; CHECK-NEXT: callq __multf3 +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @Test128Div(fp128 %d1, fp128 %d2){ +entry: + %div = fdiv fp128 %d1, %d2 + store fp128 %div, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: Test128Div: +; CHECK: callq __divtf3 +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} + +define void @Test128_1Div(fp128 %d1){ +entry: + %0 = load fp128, fp128* @vf128, align 16 + %div = fdiv fp128 %0, %d1 + store fp128 %div, fp128* @vf128, align 16 + ret void +; CHECK-LABEL: Test128_1Div: +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: movaps vf128(%rip), %xmm0 +; CHECK-NEXT: callq __divtf3 +; CHECK-NEXT: movaps %xmm0, vf128(%rip) +; CHECK: retq +} Index: llvm/trunk/test/CodeGen/X86/fp128-load.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fp128-load.ll +++ llvm/trunk/test/CodeGen/X86/fp128-load.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s + +; __float128 myFP128 = 1.0L; // x86_64-linux-android +@my_fp128 
= global fp128 0xL00000000000000003FFF000000000000, align 16 + +define fp128 @get_fp128() { +entry: + %0 = load fp128, fp128* @my_fp128, align 16 + ret fp128 %0 +; CHECK-LABEL: get_fp128: +; CHECK: movaps my_fp128(%rip), %xmm0 +; CHECK-NEXT: retq +} + +@TestLoadExtend.data = internal unnamed_addr constant [2 x float] [float 0x3FB99999A0000000, float 0x3FC99999A0000000], align 4 + +define fp128 @TestLoadExtend(fp128 %x, i32 %n) { +entry: + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds [2 x float], [2 x float]* @TestLoadExtend.data, i64 0, i64 %idxprom + %0 = load float, float* %arrayidx, align 4 + %conv = fpext float %0 to fp128 + ret fp128 %conv +; CHECK-LABEL: TestLoadExtend: +; CHECK: movslq %edi, %rax +; CHECK-NEXT: movss TestLoadExtend.data(,%rax,4), %xmm0 +; CHECK-NEXT: callq __extendsftf2 +; CHECK: retq +} + +; CHECK-LABEL: my_fp128: +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .quad 4611404543450677248 +; CHECK-NEXT: .size my_fp128, 16 Index: llvm/trunk/test/CodeGen/X86/fp128-store.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fp128-store.ll +++ llvm/trunk/test/CodeGen/X86/fp128-store.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s + +; __float128 myFP128 = 1.0L; // x86_64-linux-android +@myFP128 = global fp128 0xL00000000000000003FFF000000000000, align 16 + +define void @set_FP128(fp128 %x) { +entry: + store fp128 %x, fp128* @myFP128, align 16 + ret void +; CHECK-LABEL: set_FP128: +; CHECK: movaps %xmm0, myFP128(%rip) +; CHECK-NEXT: retq +} Index: llvm/trunk/test/CodeGen/X86/soft-fp.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/soft-fp.ll +++ llvm/trunk/test/CodeGen/X86/soft-fp.ll @@ -1,8 +1,14 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2,+soft-float | FileCheck %s -; RUN: llc < %s -march=x86-64 
-mattr=+sse2,+soft-float | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-gnux32 -mattr=+sse2,+soft-float | FileCheck %s +; RUN: llc < %s -march=x86 -mattr=+mmx,+sse,+soft-float \ +; RUN: | FileCheck %s --check-prefix=SOFT1 --check-prefix=CHECK +; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2,+soft-float \ +; RUN: | FileCheck %s --check-prefix=SOFT2 --check-prefix=CHECK +; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse \ +; RUN: | FileCheck %s --check-prefix=SSE1 --check-prefix=CHECK +; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 \ +; RUN: | FileCheck %s --check-prefix=SSE2 --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-gnux32 -mattr=+mmx,+sse2,+soft-float | FileCheck %s -; CHECK-NOT: xmm{[0-9]+} +; CHECK-NOT: xmm{{[0-9]+}} %struct.__va_list_tag = type { i32, i32, i8*, i8* } @@ -15,6 +21,8 @@ call void @bar(%struct.__va_list_tag* %va3) nounwind call void @llvm.va_end(i8* %va12) ret i32 undef +; CHECK-LABEL: t1: +; CHECK: ret{{[lq]}} } declare void @llvm.va_start(i8*) nounwind @@ -27,4 +35,23 @@ entry: %0 = fadd float %a, %b ; [#uses=1] ret float %0 +; CHECK-LABEL: t2: +; SOFT1-NOT: xmm{{[0-9]+}} +; SOFT2-NOT: xmm{{[0-9]+}} +; SSE1: xmm{{[0-9]+}} +; SSE2: xmm{{[0-9]+}} +; CHECK: ret{{[lq]}} +} + +; soft-float means no SSE instruction and passing fp128 as pair of i64. 
+define fp128 @t3(fp128 %a, fp128 %b) nounwind readnone { +entry: + %0 = fadd fp128 %b, %a + ret fp128 %0 +; CHECK-LABEL: t3: +; SOFT1-NOT: xmm{{[0-9]+}} +; SOFT2-NOT: xmm{{[0-9]+}} +; SSE1: xmm{{[0-9]+}} +; SSE2: xmm{{[0-9]+}} +; CHECK: ret{{[lq]}} } Index: llvm/trunk/utils/TableGen/X86RecognizableInstr.cpp =================================================================== --- llvm/trunk/utils/TableGen/X86RecognizableInstr.cpp +++ llvm/trunk/utils/TableGen/X86RecognizableInstr.cpp @@ -951,6 +951,7 @@ TYPE("f128mem", TYPE_M128) TYPE("f256mem", TYPE_M256) TYPE("f512mem", TYPE_M512) + TYPE("FR128", TYPE_XMM128) TYPE("FR64", TYPE_XMM64) TYPE("FR64X", TYPE_XMM64) TYPE("f64mem", TYPE_M64FP) @@ -1069,6 +1070,7 @@ // register IDs in 8-bit immediates nowadays. ENCODING("FR32", ENCODING_IB) ENCODING("FR64", ENCODING_IB) + ENCODING("FR128", ENCODING_IB) ENCODING("VR128", ENCODING_IB) ENCODING("VR256", ENCODING_IB) ENCODING("FR32X", ENCODING_IB) @@ -1091,6 +1093,7 @@ ENCODING("GR8", ENCODING_RM) ENCODING("VR128", ENCODING_RM) ENCODING("VR128X", ENCODING_RM) + ENCODING("FR128", ENCODING_RM) ENCODING("FR64", ENCODING_RM) ENCODING("FR32", ENCODING_RM) ENCODING("FR64X", ENCODING_RM) @@ -1120,6 +1123,7 @@ ENCODING("GR64", ENCODING_REG) ENCODING("GR8", ENCODING_REG) ENCODING("VR128", ENCODING_REG) + ENCODING("FR128", ENCODING_REG) ENCODING("FR64", ENCODING_REG) ENCODING("FR32", ENCODING_REG) ENCODING("VR64", ENCODING_REG) @@ -1157,6 +1161,7 @@ ENCODING("GR32", ENCODING_VVVV) ENCODING("GR64", ENCODING_VVVV) ENCODING("FR32", ENCODING_VVVV) + ENCODING("FR128", ENCODING_VVVV) ENCODING("FR64", ENCODING_VVVV) ENCODING("VR128", ENCODING_VVVV) ENCODING("VR256", ENCODING_VVVV)