diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2914,6 +2914,9 @@ DAG.getConstant(16, dl, TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout()))); Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op); + // Add fp_extend in case the output is bigger than f32. + if (Node->getValueType(0) != MVT::f32) + Op = DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Op); Results.push_back(Op); break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2748,39 +2748,47 @@ } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT OVT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0)); SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1)); SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2)); SDLoc dl(N); // Promote to the larger FP type. - Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0); - Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1); - Op2 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op2); + auto PromotionOpcode = GetPromotionOpcode(OVT, NVT); + Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0); + Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1); + Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2); SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2); // Convert back to FP16 as an integer. 
- return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res); + return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FPOWI(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT OVT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0)); SDValue Op1 = N->getOperand(1); SDLoc dl(N); - Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0); + // Promote to the larger FP type. + Op0 = DAG.getNode(GetPromotionOpcode(OVT, NVT), dl, NVT, Op0); SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1); // Convert back to FP16 as an integer. - return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res); + return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) { + EVT RVT = N->getValueType(0); + EVT SVT = N->getOperand(0).getValueType(); + if (N->isStrictFPOpcode()) { + assert(RVT == MVT::f16); SDValue Res = DAG.getNode(ISD::STRICT_FP_TO_FP16, SDLoc(N), {MVT::i16, MVT::Other}, {N->getOperand(0), N->getOperand(1)}); @@ -2788,7 +2796,8 @@ return Res; } - return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), MVT::i16, N->getOperand(0)); + return DAG.getNode(GetPromotionOpcode(SVT, RVT), SDLoc(N), MVT::i16, + N->getOperand(0)); } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_LOAD(SDNode *N) { @@ -2823,13 +2832,14 @@ } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_XINT_TO_FP(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT OVT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); SDLoc dl(N); SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); // Round the value to the softened type. 
- return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res); + return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UNDEF(SDNode *N) { @@ -2837,33 +2847,36 @@ } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryOp(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT OVT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); SDValue Op = GetSoftPromotedHalf(N->getOperand(0)); SDLoc dl(N); // Promote to the larger FP type. - Op = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op); + Op = DAG.getNode(GetPromotionOpcode(OVT, NVT), dl, NVT, Op); SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op); // Convert back to FP16 as an integer. - return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res); + return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BinOp(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT OVT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0)); SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1)); SDLoc dl(N); // Promote to the larger FP type. - Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0); - Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1); + auto PromotionOpcode = GetPromotionOpcode(OVT, NVT); + Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0); + Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1); SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1); // Convert back to FP16 as an integer. 
- return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res); + return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_VECREDUCE(SDNode *N) { @@ -2947,22 +2960,27 @@ unsigned OpNo) { assert(OpNo == 1 && "Only Operand 1 must need promotion here"); SDValue Op1 = N->getOperand(1); + EVT RVT = Op1.getValueType(); SDLoc dl(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op1.getValueType()); Op1 = GetSoftPromotedHalf(Op1); - Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1); + Op1 = DAG.getNode(GetPromotionOpcode(RVT, NVT), dl, NVT, Op1); return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), N->getOperand(0), Op1); } SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_EXTEND(SDNode *N) { + EVT RVT = N->getValueType(0); bool IsStrict = N->isStrictFPOpcode(); - SDValue Op = GetSoftPromotedHalf(N->getOperand(IsStrict ? 1 : 0)); + SDValue Op = N->getOperand(IsStrict ? 1 : 0); + EVT SVT = Op.getValueType(); + Op = GetSoftPromotedHalf(Op);
if (IsStrict) { + assert(SVT == MVT::f16); SDValue Res = DAG.getNode(ISD::STRICT_FP16_TO_FP, SDLoc(N), {N->getValueType(0), MVT::Other}, {N->getOperand(0), Op}); @@ -2971,31 +2989,33 @@ return SDValue(); } - return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), Op); + return DAG.getNode(GetPromotionOpcode(SVT, RVT), SDLoc(N), RVT, Op); } SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT(SDNode *N) { SDValue Op = N->getOperand(0); + EVT SVT = Op.getValueType(); SDLoc dl(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()); Op = GetSoftPromotedHalf(Op); - SDValue Res = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op); + SDValue Res = DAG.getNode(GetPromotionOpcode(SVT, NVT), dl, NVT, Op); return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Res); } SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT_SAT(SDNode *N) { SDValue Op = N->getOperand(0); + EVT SVT = Op.getValueType(); SDLoc dl(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()); Op = GetSoftPromotedHalf(Op); - SDValue Res = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op); + SDValue Res = DAG.getNode(GetPromotionOpcode(SVT, NVT), dl, NVT, Op); return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Res, N->getOperand(1)); @@ -3008,14 +3028,16 @@ SDValue Op1 = N->getOperand(1); SDLoc dl(N); - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType()); + EVT SVT = Op0.getValueType(); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), SVT); Op0 = GetSoftPromotedHalf(Op0); Op1 = GetSoftPromotedHalf(Op1); // Promote to the larger FP type. 
- Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0); - Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1); + auto PromotionOpcode = GetPromotionOpcode(SVT, NVT); + Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0); + Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0), Op0, Op1, N->getOperand(2), N->getOperand(3), N->getOperand(4)); @@ -3027,14 +3049,16 @@ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get(); SDLoc dl(N); + EVT SVT = Op0.getValueType(); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType()); Op0 = GetSoftPromotedHalf(Op0); Op1 = GetSoftPromotedHalf(Op1); // Promote to the larger FP type. - Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0); - Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1); + auto PromotionOpcode = GetPromotionOpcode(SVT, NVT); + Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0); + Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1); return DAG.getSetCC(SDLoc(N), N->getValueType(0), Op0, Op1, CCCode); } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1385,7 +1385,7 @@ NumRegistersForVT[MVT::bf16] = NumRegistersForVT[MVT::f32]; RegisterTypeForVT[MVT::bf16] = RegisterTypeForVT[MVT::f32]; TransformToType[MVT::bf16] = MVT::f32; - ValueTypeActions.setTypeAction(MVT::bf16, TypePromoteFloat); + ValueTypeActions.setTypeAction(MVT::bf16, TypeSoftPromoteHalf); } // Loop over all of the vector value types to see which need transformations. 
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -6,10 +6,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: movzwl (%rsi), %eax ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: movzwl (%rsi), %eax +; CHECK-NEXT: movzwl (%rdi), %eax ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 @@ -29,10 +29,10 @@ ; CHECK-LABEL: add2: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movd %xmm1, %eax -; CHECK-NEXT: shll $16, %eax -; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: movd %xmm1, %ecx +; CHECK-NEXT: shll $16, %ecx +; CHECK-NEXT: movd %ecx, %xmm1 ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 @@ -46,27 +46,31 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind { ; CHECK-LABEL: add_double: ; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: pushq %rax ; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movq %rsi, %rbx ; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq __truncdfbf2@PLT -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: shll $16, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movd %xmm0, %ebp ; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq __truncdfbf2@PLT ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: shll $16, %ebp +; CHECK-NEXT: movd %ebp, %xmm0 +; CHECK-NEXT: addss %xmm1, %xmm0 +; CHECK-NEXT: callq __truncsfbf2@PLT +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: cvtss2sd %xmm0, %xmm0 ; CHECK-NEXT: 
movsd %xmm0, (%r14) -; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %la = load double, ptr %pa %a = fptrunc double %la to bfloat @@ -81,21 +85,27 @@ define double @add_double2(double %da, double %db) nounwind { ; CHECK-LABEL: add_double2: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: callq __truncdfbf2@PLT -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: shll $16, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movd %xmm0, %ebx ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[0],zero ; CHECK-NEXT: callq __truncdfbf2@PLT ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: shll $16, %ebx +; CHECK-NEXT: movd %ebx, %xmm0 +; CHECK-NEXT: addss %xmm1, %xmm0 +; CHECK-NEXT: callq __truncsfbf2@PLT +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: cvtss2sd %xmm0, %xmm0 -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq %a = fptrunc double %da to bfloat %b = fptrunc double %db to bfloat @@ -174,134 +184,135 @@ ; CHECK-LABEL: addv: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $32, %rsp -; CHECK-NEXT: movq %xmm1, %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: movd %ecx, %xmm2 +; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: movq %xmm0, %rcx -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $32, %rdx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: 
movd %edx, %xmm3 -; CHECK-NEXT: addss %xmm2, %xmm3 -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: shrq $48, %rdx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $48, %rdx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: movd %edx, %xmm3 -; CHECK-NEXT: addss %xmm2, %xmm3 -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: movd %edx, %xmm3 -; CHECK-NEXT: addss %xmm2, %xmm3 -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 -; CHECK-NEXT: movd %eax, %xmm2 -; CHECK-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 -; CHECK-NEXT: movd %ecx, %xmm3 -; CHECK-NEXT: addss %xmm2, %xmm3 -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; CHECK-NEXT: movq %xmm1, %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: movd %ecx, %xmm1 +; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %xmm1, %rdx +; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: shrq $48, %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq $48, %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-NEXT: movq %xmm0, %rcx -; 
CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $32, %rdx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: addss %xmm1, %xmm0 -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: shrq $48, %rdx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $48, %rdx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: movd %edx, %xmm1 -; CHECK-NEXT: addss %xmm0, %xmm1 -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: movd %edx, %xmm1 -; CHECK-NEXT: addss %xmm0, %xmm1 -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movq %xmm0, %rbx +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: movq %rax, (%rsp) # 8-byte Spill +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; CHECK-NEXT: movq %xmm0, %rbp +; CHECK-NEXT: movq %rbp, %r15 +; CHECK-NEXT: shrq $32, %r15 +; CHECK-NEXT: movq %rbx, %r13 +; CHECK-NEXT: shrq $48, %r13 +; CHECK-NEXT: movq %rbp, %r12 +; CHECK-NEXT: shrq $48, %r12 +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 -; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT -; CHECK-NEXT: movd %xmm0, %ebx +; CHECK-NEXT: movd %xmm0, %r14d +; CHECK-NEXT: shll $16, %r14d +; CHECK-NEXT: shll $16, %ebp +; CHECK-NEXT: movd %ebp, %xmm1 ; CHECK-NEXT: shll $16, %ebx -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movd %ebx, %xmm0 +; 
CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movzwl %ax, %r14d -; CHECK-NEXT: orl %ebx, %r14d -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movzwl %ax, %ebx +; CHECK-NEXT: orl %r14d, %ebx +; CHECK-NEXT: shll $16, %r12d +; CHECK-NEXT: movd %r12d, %xmm1 +; CHECK-NEXT: shll $16, %r13d +; CHECK-NEXT: movd %r13d, %xmm0 +; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %ebp ; CHECK-NEXT: shll $16, %ebp -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: shll $16, %r15d +; CHECK-NEXT: movd %r15d, %xmm1 +; CHECK-NEXT: movq (%rsp), %rax # 8-byte Reload +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movzwl %ax, %ebx -; CHECK-NEXT: orl %ebp, %ebx -; CHECK-NEXT: shlq $32, %rbx -; CHECK-NEXT: orq %r14, %rbx -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movzwl %ax, %r14d +; CHECK-NEXT: orl %ebp, %r14d +; CHECK-NEXT: shlq $32, %r14 +; CHECK-NEXT: orq %rbx, %r14 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %ebp ; CHECK-NEXT: shll $16, %ebp -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; 
CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movzwl %ax, %r14d -; CHECK-NEXT: orl %ebp, %r14d -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movzwl %ax, %ebx +; CHECK-NEXT: orl %ebp, %ebx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %ebp ; CHECK-NEXT: shll $16, %ebp -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: orl %ebp, %eax ; CHECK-NEXT: shlq $32, %rax -; CHECK-NEXT: orq %r14, %rax +; CHECK-NEXT: orq %rbx, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movq %rbx, %xmm1 +; CHECK-NEXT: movq %r14, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: addq $32, %rsp +; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %add = fadd <8 x bfloat> %a, %b