diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -21453,6 +21453,42 @@
 first element is undefined.
 
+'``llvm.arithmetic.fence``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.arithmetic.fence(<type> <op>)
+
+Overview:
+"""""""""
+
+The purpose of the ``llvm.arithmetic.fence`` intrinsic
+is to prevent the optimizer from performing fast-math optimizations,
+particularly reassociation,
+between the argument and the expression that contains the argument.
+It can be used to preserve the parentheses in the source language.
+
+Arguments:
+""""""""""
+
+The ``llvm.arithmetic.fence`` intrinsic takes only one argument.
+The argument and the return value are floating-point numbers,
+or vectors of floating-point numbers, of the same type.
+
+Semantics:
+""""""""""
+
+This intrinsic returns the value of its operand. The optimizer can optimize
+the argument, but the optimizer cannot hoist any component of the operand
+to the containing context, and the optimizer cannot move the calculation of
+any expression in the containing context into the operand.
+
+
 '``llvm.donothing``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -576,6 +576,7 @@
     case Intrinsic::assume:
     case Intrinsic::sideeffect:
     case Intrinsic::pseudoprobe:
+    case Intrinsic::arithmetic_fence:
     case Intrinsic::dbg_declare:
     case Intrinsic::dbg_value:
     case Intrinsic::dbg_label:
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1609,6 +1609,7 @@
     case Intrinsic::lifetime_end:
     case Intrinsic::sideeffect:
     case Intrinsic::pseudoprobe:
+    case Intrinsic::arithmetic_fence:
       return 0;
     case Intrinsic::masked_store: {
       Type *Ty = Tys[0];
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1097,6 +1097,10 @@
   /// specifier.
   PREFETCH,
 
+  /// ARITH_FENCE - This corresponds to an arithmetic fence intrinsic. Both its
+  /// operand and output are the same floating-point type.
+  ARITH_FENCE,
+
   /// OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope)
   /// This corresponds to the fence instruction. It takes an input chain, and
   /// two integer constants: an AtomicOrdering and a SynchronizationScope.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -318,6 +318,7 @@
   void CannotYetSelect(SDNode *N);
 
   void Select_FREEZE(SDNode *N);
+  void Select_ARITH_FENCE(SDNode *N);
 
 private:
   void DoInstructionSelection();
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -905,6 +905,13 @@
     return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name);
   }
 
+  /// Create a call to the arithmetic_fence intrinsic.
+ CallInst *CreateArithmeticFence(Value *Val, Type *DstType, + const Twine &Name = "") { + return CreateIntrinsic(Intrinsic::arithmetic_fence, DstType, Val, nullptr, + Name); + } + /// Create a call to the experimental.vector.extract intrinsic. CallInst *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name = "") { diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1335,6 +1335,9 @@ def int_pseudoprobe : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [IntrInaccessibleMemOnly, IntrWillReturn]>; +// Arithmetic fence intrinsic. +def int_arithmetic_fence : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + // Intrinsics to support half precision floating point format let IntrProperties = [IntrNoMem, IntrWillReturn] in { def int_convert_to_fp16 : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_anyfloat_ty]>; diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -121,6 +121,9 @@ /// Pseudo probe HANDLE_TARGET_OPCODE(PSEUDO_PROBE) +/// Arithmetic fence. +HANDLE_TARGET_OPCODE(ARITH_FENCE) + /// A Stackmap instruction captures the location of live variables at its /// position in the instruction stream. It is followed by a shadow of bytes /// that must lie within the function and not contain another stackmap. diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1176,6 +1176,13 @@ let AsmString = "PSEUDO_PROBE"; let hasSideEffects = 1; } +def ARITH_FENCE : StandardPseudoInstruction { + let OutOperandList = (outs unknown:$dst); + let InOperandList = (ins unknown:$src); + let AsmString = ""; + let hasSideEffects = false; + let Constraints = "$src = $dst"; +} def STACKMAP : StandardPseudoInstruction { let OutOperandList = (outs); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1332,6 +1332,10 @@ case TargetOpcode::PSEUDO_PROBE: emitPseudoProbe(MI); break; + case TargetOpcode::ARITH_FENCE: + if (isVerbose()) + OutStreamer->emitRawComment("ARITH_FENCE"); + break; default: emitInstruction(&MI); if (CanDoExtraAnalysis) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -90,6 +90,7 @@ case ISD::FNEARBYINT: case ISD::FNEG: case ISD::FREEZE: + case ISD::ARITH_FENCE: case ISD::FP_EXTEND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: @@ -983,6 +984,7 @@ case ISD::FNEARBYINT: case ISD::FNEG: case ISD::FREEZE: + case ISD::ARITH_FENCE: case ISD::FP_EXTEND: case ISD::FP_ROUND: case ISD::FP_TO_SINT: @@ -3146,6 +3148,7 @@ case ISD::CTTZ_ZERO_UNDEF: case ISD::FNEG: case ISD::FREEZE: + case ISD::ARITH_FENCE: case ISD::FCANONICALIZE: Res = WidenVecRes_Unary(N); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6292,6 +6292,12 @@ 
getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), Flags)); return; + case Intrinsic::arithmetic_fence: { + setValue(&I, DAG.getNode(ISD::ARITH_FENCE, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), Flags)); + return; + } case Intrinsic::fma: setValue(&I, DAG.getNode( ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(), diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2325,6 +2325,11 @@ N->getOperand(0)); } +void SelectionDAGISel::Select_ARITH_FENCE(SDNode *N) { + CurDAG->SelectNodeTo(N, TargetOpcode::ARITH_FENCE, N->getValueType(0), + N->getOperand(0)); +} + /// GetVBR - decode a vbr encoding whose top bit is set. LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) { @@ -2876,6 +2881,9 @@ case ISD::FREEZE: Select_FREEZE(NodeToMatch); return; + case ISD::ARITH_FENCE: + Select_ARITH_FENCE(NodeToMatch); + return; } assert(!NodeToMatch->isMachineOpcode() && "Node already selected!"); diff --git a/llvm/test/CodeGen/X86/arithmetic_fence.ll b/llvm/test/CodeGen/X86/arithmetic_fence.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/arithmetic_fence.ll @@ -0,0 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X64 + +define float @f1(float %a, float %b, float %c) { +; X86-LABEL: f1: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem +; X86-NEXT: vmovss %xmm1, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: f1: +; X64: # %bb.0: +; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; X64-NEXT: retq + %mul = fmul fast float %b, %a + %add = fadd fast float %mul, %c + ret float %add +} + +define float @f2(float %a, float %b, float %c) { +; X86-LABEL: f2: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: #ARITH_FENCE +; X86-NEXT: vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: f2: +; X64: # %bb.0: +; X64-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; X64-NEXT: #ARITH_FENCE +; X64-NEXT: vaddss %xmm2, %xmm0, %xmm0 +; X64-NEXT: retq + %mul = fmul fast float %b, %a + %tmp = call float @llvm.arithmetic.fence.f32(float %mul) + %add = fadd fast float %tmp, %c + ret float %add +} + +define double @f3(double %a) { +; X86-LABEL: f3: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; 
X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +; +; X64-LABEL: f3: +; X64: # %bb.0: +; X64-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: retq + %1 = fadd fast double %a, %a + %2 = fadd fast double %a, %a + %3 = fadd fast double %1, %2 + ret double %3 +} + +define double @f4(double %a) { +; X86-LABEL: f4: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vaddsd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovapd %xmm0, %xmm1 +; X86-NEXT: #ARITH_FENCE +; X86-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +; +; X64-LABEL: f4: +; X64: # %bb.0: +; X64-NEXT: vaddsd %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovapd %xmm0, %xmm1 +; X64-NEXT: #ARITH_FENCE +; X64-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; X64-NEXT: retq + %1 = fadd fast double %a, %a + %t = call double @llvm.arithmetic.fence.f64(double %1) + %2 = fadd fast double %a, %a + %3 = fadd fast double %t, %2 + ret double %3 +} + +define <2 x float> @f5(<2 x float> %a) { +; X86-LABEL: f5: +; X86: # %bb.0: +; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: f5: +; X64: # %bb.0: +; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: retq + %1 = fadd fast <2 x float> %a, %a + %2 = fadd fast <2 x float> %a, %a + %3 = fadd fast <2 x float> %1, %2 + ret <2 x float> %3 +} + +define <2 x float> @f6(<2 x float> %a) { +; X86-LABEL: f6: +; X86: # %bb.0: +; X86-NEXT: vaddps %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovaps %xmm0, %xmm1 +; X86-NEXT: #ARITH_FENCE +; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: f6: +; X64: # %bb.0: +; X64-NEXT: vaddps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovaps %xmm0, %xmm1 +; X64-NEXT: #ARITH_FENCE +; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; X64-NEXT: retq + %1 = fadd fast <2 x float> %a, %a + %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1) + %2 = fadd fast <2 x float> %a, %a + %3 = fadd fast <2 x float> %t, %2 + ret <2 x float> %3 +} + +declare float @llvm.arithmetic.fence.f32(float) +declare double @llvm.arithmetic.fence.f64(double) +declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>) diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 + +define double @f1(double %a) { +; X86-LABEL: f1: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movsd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; 
X86-NEXT: retl +; +; X64-LABEL: f1: +; X64: # %bb.0: +; X64-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: retq + %1 = fadd fast double %a, %a + %2 = fadd fast double %a, %a + %3 = fadd fast double %1, %2 + ret double %3 +} + +define double @f2(double %a) { +; X86-LABEL: f2: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: addsd %xmm0, %xmm0 +; X86-NEXT: movapd %xmm0, %xmm1 +; X86-NEXT: #ARITH_FENCE +; X86-NEXT: addsd %xmm0, %xmm1 +; X86-NEXT: movsd %xmm1, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +; +; X64-LABEL: f2: +; X64: # %bb.0: +; X64-NEXT: addsd %xmm0, %xmm0 +; X64-NEXT: movapd %xmm0, %xmm1 +; X64-NEXT: #ARITH_FENCE +; X64-NEXT: addsd %xmm0, %xmm1 +; X64-NEXT: movapd %xmm1, %xmm0 +; X64-NEXT: retq + %1 = fadd fast double %a, %a + %t = call double @llvm.arithmetic.fence.f64(double %1) + %2 = fadd fast double %a, %a + %3 = fadd fast double %t, %2 + ret double %3 +} + +define <2 x float> @f3(<2 x float> %a) { +; X86-LABEL: f3: +; X86: # %bb.0: +; X86-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: f3: +; X64: # %bb.0: +; X64-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: retq + %1 = fadd fast <2 x float> %a, %a + %2 = fadd fast <2 x float> %a, %a + %3 = fadd fast <2 x float> %1, %2 + ret <2 x float> %3 +} + +define <2 x float> @f4(<2 x float> %a) { +; X86-LABEL: f4: +; X86: # %bb.0: +; X86-NEXT: addps %xmm0, %xmm0 +; X86-NEXT: movaps %xmm0, %xmm1 +; X86-NEXT: #ARITH_FENCE +; X86-NEXT: addps %xmm0, %xmm1 +; X86-NEXT: movaps %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: f4: +; X64: # %bb.0: +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, %xmm1 +; X64-NEXT: #ARITH_FENCE +; X64-NEXT: addps %xmm0, %xmm1 +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq + %1 = fadd fast <2 x float> %a, %a + %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1) + %2 = fadd fast <2 x float> %a, %a + %3 = fadd fast <2 x float> %t, %2 + ret <2 x float> %3 +} + +define <8 x float> @f5(<8 x float> %a) { +; X86-LABEL: f5: +; X86: # %bb.0: +; X86-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0] +; X86-NEXT: mulps %xmm2, %xmm0 +; X86-NEXT: mulps %xmm2, %xmm1 +; X86-NEXT: retl +; +; X64-LABEL: f5: +; X64: # %bb.0: +; X64-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0] +; X64-NEXT: mulps %xmm2, %xmm0 +; X64-NEXT: mulps %xmm2, %xmm1 +; X64-NEXT: retq + %1 = fadd fast <8 x float> %a, %a + %2 = fadd fast <8 x float> %a, %a + %3 = fadd fast <8 x float> %1, %2 + ret <8 x float> %3 +} + +define <8 x float> @f6(<8 x float> %a) { +; X86-LABEL: f6: +; X86: # %bb.0: +; X86-NEXT: addps %xmm0, %xmm0 +; X86-NEXT: addps %xmm1, %xmm1 +; X86-NEXT: movaps %xmm1, %xmm2 +; X86-NEXT: #ARITH_FENCE +; X86-NEXT: movaps %xmm0, %xmm3 +; X86-NEXT: #ARITH_FENCE +; X86-NEXT: addps %xmm0, %xmm3 +; X86-NEXT: addps %xmm1, %xmm2 +; X86-NEXT: movaps %xmm3, %xmm0 +; X86-NEXT: movaps %xmm2, %xmm1 +; X86-NEXT: retl +; +; X64-LABEL: f6: +; X64: # %bb.0: +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: addps %xmm1, %xmm1 +; X64-NEXT: movaps %xmm1, %xmm2 +; X64-NEXT: #ARITH_FENCE +; X64-NEXT: movaps %xmm0, %xmm3 +; X64-NEXT: #ARITH_FENCE +; X64-NEXT: addps %xmm0, %xmm3 +; X64-NEXT: addps %xmm1, %xmm2 +; X64-NEXT: 
movaps %xmm3, %xmm0 +; X64-NEXT: movaps %xmm2, %xmm1 +; X64-NEXT: retq + %1 = fadd fast <8 x float> %a, %a + %t = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> %1) + %2 = fadd fast <8 x float> %a, %a + %3 = fadd fast <8 x float> %t, %2 + ret <8 x float> %3 +} + +declare float @llvm.arithmetic.fence.f32(float) +declare double @llvm.arithmetic.fence.f64(double) +declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>) +declare <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float>)
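
A minimal usage sketch (illustrative only; the function name @fence_parens is hypothetical, not part of the patch): under fast-math flags the fence keeps the parenthesized sum (a + b) from being reassociated with c, which is the IR-level behavior the f2/f4/f6 tests above check at the assembly level.

define float @fence_parens(float %a, float %b, float %c) {
  ; Evaluate (a + b) first; the fence prevents fast-math reassociation
  ; of this sum with the later addition of %c, preserving the source
  ; parenthesization of (a + b) + c.
  %ab = fadd fast float %a, %b
  %p = call float @llvm.arithmetic.fence.f32(float %ab)
  %res = fadd fast float %p, %c
  ret float %res
}

declare float @llvm.arithmetic.fence.f32(float)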