Index: lib/CodeGen/ScheduleDAGInstrs.cpp
===================================================================
--- lib/CodeGen/ScheduleDAGInstrs.cpp
+++ lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -712,7 +712,6 @@
   AAForDep = UseAA ? AA : nullptr;
 
   BarrierChain = nullptr;
-  SUnit *FPBarrierChain = nullptr;
 
   this->TrackLaneMasks = TrackLaneMasks;
   MISUnitMap.clear();
@@ -744,6 +743,14 @@
   // done.
   Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
 
+  // Track all instructions that may raise floating-point exceptions.
+  // These do not depend on one another (or on normal loads or stores),
+  // but must not be rescheduled across global barriers. Note that we
+  // don't really need a "map" here since we don't track those MIs by
+  // value; using the same Value2SUsMap data type here is simply a
+  // matter of convenience.
+  Value2SUsMap FPExceptions;
+
   // Remove any stale debug info; sometimes BuildSchedGraph is called again
   // without emitting the info from the previous call.
   DbgValues.clear();
@@ -871,20 +878,24 @@
       addBarrierChain(Loads);
       addBarrierChain(NonAliasStores);
       addBarrierChain(NonAliasLoads);
-
-      // Add dependency against previous FP barrier and reset FP barrier.
-      if (FPBarrierChain)
-        FPBarrierChain->addPredBarrier(BarrierChain);
-      FPBarrierChain = BarrierChain;
+      addBarrierChain(FPExceptions);
 
       continue;
     }
 
-    // Instructions that may raise FP exceptions depend on each other.
+    // Instructions that may raise FP exceptions may not be moved
+    // across any global barriers.
     if (MI.mayRaiseFPException()) {
-      if (FPBarrierChain)
-        FPBarrierChain->addPredBarrier(SU);
-      FPBarrierChain = SU;
+      if (BarrierChain)
+        BarrierChain->addPredBarrier(SU);
+
+      FPExceptions.insert(SU, UnknownValue);
+
+      if (FPExceptions.size() >= HugeRegion) {
+        LLVM_DEBUG(dbgs() << "Reducing FPExceptions map.\n";);
+        Value2SUsMap empty;
+        reduceHugeMemNodeMaps(FPExceptions, empty, getReductionSize());
+      }
     }
 
     // If it's not a store or a variant load, we're done.
Index: test/CodeGen/SystemZ/fp-strict-alias.ll
===================================================================
--- test/CodeGen/SystemZ/fp-strict-alias.ll
+++ test/CodeGen/SystemZ/fp-strict-alias.ll
@@ -2,138 +2,216 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 
-declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
-declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata)
 declare float @llvm.sqrt.f32(float)
 declare void @llvm.s390.sfpc(i32)
 
-; For non-strict operations, we expect the post-RA scheduler to
-; separate the two square root instructions on z13.
-define void @f1(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
+; The basic assumption of all the following tests is that on z13, we
+; never want to see two square root instructions directly in a row, so
+; the post-RA scheduler will always schedule something else in between
+; whenever possible.
+
+; We can move any FP operation across a (normal) store.
+
+define void @f1(float %f1, float %f2, float *%ptr1, float *%ptr2) {
 ; CHECK-LABEL: f1:
 ; CHECK: sqebr
-; CHECK: {{aebr|sebr}}
+; CHECK: ste
 ; CHECK: sqebr
+; CHECK: ste
 ; CHECK: br %r14
 
-  %add = fadd float %f1, %f2
-  %sub = fsub float %f3, %f4
-  %sqrt1 = call float @llvm.sqrt.f32(float %f2)
-  %sqrt2 = call float @llvm.sqrt.f32(float %f4)
-
-  %ptr1 = getelementptr float, float *%ptr0, i64 1
-  %ptr2 = getelementptr float, float *%ptr0, i64 2
-  %ptr3 = getelementptr float, float *%ptr0, i64 3
+  %sqrt1 = call float @llvm.sqrt.f32(float %f1)
+  %sqrt2 = call float @llvm.sqrt.f32(float %f2)
 
-  store float %add, float *%ptr0
-  store float %sub, float *%ptr1
-  store float %sqrt1, float *%ptr2
-  store float %sqrt2, float *%ptr3
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
 
   ret void
 }
 
-; But for strict operations, this must not happen.
-define void @f2(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
+define void @f2(float %f1, float %f2, float *%ptr1, float *%ptr2) {
 ; CHECK-LABEL: f2:
-; CHECK: {{aebr|sebr}}
-; CHECK: {{aebr|sebr}}
 ; CHECK: sqebr
+; CHECK: ste
 ; CHECK: sqebr
+; CHECK: ste
 ; CHECK: br %r14
 
-  %add = call float @llvm.experimental.constrained.fadd.f32(
-                        float %f1, float %f2,
+  %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f1,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
-  %sub = call float @llvm.experimental.constrained.fsub.f32(
-                        float %f3, float %f4,
+                        metadata !"fpexcept.ignore")
+  %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.ignore")
+
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
+
+  ret void
+}
+
+define void @f3(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f3:
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: br %r14
+
   %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
-                        float %f2,
+                        float %f1,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.strict")
   %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
-                        float %f4,
+                        float %f2,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.strict")
 
-  %ptr1 = getelementptr float, float *%ptr0, i64 1
-  %ptr2 = getelementptr float, float *%ptr0, i64 2
-  %ptr3 = getelementptr float, float *%ptr0, i64 3
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
 
-  store float %add, float *%ptr0
-  store float %sub, float *%ptr1
-  store float %sqrt1, float *%ptr2
-  store float %sqrt2, float *%ptr3
+  ret void
+}
+
+
+; We can move a non-strict FP operation or an fpexcept.ignore
+; operation even across a volatile store, but not an fpexcept.strict
+; operation.
+
+define void @f4(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f4:
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: br %r14
+
+  %sqrt1 = call float @llvm.sqrt.f32(float %f1)
+  %sqrt2 = call float @llvm.sqrt.f32(float %f2)
+
+  store volatile float %sqrt1, float *%ptr1
+  store volatile float %sqrt2, float *%ptr2
 
   ret void
 }
 
-; On the other hand, strict operations that use the fpexcept.ignore
-; exception behaviour should be scheduled freely.
-define void @f3(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
-; CHECK-LABEL: f3:
+define void @f5(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f5:
 ; CHECK: sqebr
-; CHECK: {{aebr|sebr}}
+; CHECK: ste
 ; CHECK: sqebr
+; CHECK: ste
 ; CHECK: br %r14
 
-  %add = call float @llvm.experimental.constrained.fadd.f32(
-                        float %f1, float %f2,
+  %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f1,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.ignore")
-  %sub = call float @llvm.experimental.constrained.fsub.f32(
-                        float %f3, float %f4,
+  %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f2,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.ignore")
+
+  store volatile float %sqrt1, float *%ptr1
+  store volatile float %sqrt2, float *%ptr2
+
+  ret void
+}
+
+define void @f6(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f6:
+; CHECK: sqebr
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
+; CHECK: br %r14
+
   %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f1,
+                        metadata !"round.dynamic",
+                        metadata !"fpexcept.strict")
+  %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
                         float %f2,
                         metadata !"round.dynamic",
+                        metadata !"fpexcept.strict")
+
+  store volatile float %sqrt1, float *%ptr1
+  store volatile float %sqrt2, float *%ptr2
+
+  ret void
+}
+
+
+; No variant of FP operation can be scheduled across an SFPC.
+
+define void @f7(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f7:
+; CHECK: sqebr
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
+; CHECK: br %r14
+
+  %sqrt1 = call float @llvm.sqrt.f32(float %f1)
+  %sqrt2 = call float @llvm.sqrt.f32(float %f2)
+
+  call void @llvm.s390.sfpc(i32 0)
+
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
+
+  ret void
+}
+
+define void @f8(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f8:
+; CHECK: sqebr
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
+; CHECK: br %r14
+
+  %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f1,
+                        metadata !"round.dynamic",
                         metadata !"fpexcept.ignore")
   %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
-                        float %f4,
+                        float %f2,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.ignore")
 
-  %ptr1 = getelementptr float, float *%ptr0, i64 1
-  %ptr2 = getelementptr float, float *%ptr0, i64 2
-  %ptr3 = getelementptr float, float *%ptr0, i64 3
+  call void @llvm.s390.sfpc(i32 0)
 
-  store float %add, float *%ptr0
-  store float %sub, float *%ptr1
-  store float %sqrt1, float *%ptr2
-  store float %sqrt2, float *%ptr3
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
 
   ret void
 }
 
-; However, even non-strict operations must not be scheduled across an SFPC.
-define void @f4(float %f1, float %f2, float %f3, float %f4, float *%ptr0) { -; CHECK-LABEL: f4: -; CHECK: {{aebr|sebr}} -; CHECK: {{aebr|sebr}} -; CHECK: sfpc +define void @f9(float %f1, float %f2, float *%ptr1, float *%ptr2) { +; CHECK-LABEL: f9: ; CHECK: sqebr ; CHECK: sqebr +; CHECK: ste +; CHECK: ste ; CHECK: br %r14 - %add = fadd float %f1, %f2 - %sub = fsub float %f3, %f4 + %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32( + float %f1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32( + float %f2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + call void @llvm.s390.sfpc(i32 0) - %sqrt1 = call float @llvm.sqrt.f32(float %f2) - %sqrt2 = call float @llvm.sqrt.f32(float %f4) - %ptr1 = getelementptr float, float *%ptr0, i64 1 - %ptr2 = getelementptr float, float *%ptr0, i64 2 - %ptr3 = getelementptr float, float *%ptr0, i64 3 - - store float %add, float *%ptr0 - store float %sub, float *%ptr1 - store float %sqrt1, float *%ptr2 - store float %sqrt2, float *%ptr3 + store float %sqrt1, float *%ptr1 + store float %sqrt2, float *%ptr2 ret void } Index: test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll =================================================================== --- test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll +++ test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll @@ -108,8 +108,8 @@ ; S390X-NEXT: ldeb %f3, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI3_2 ; S390X-NEXT: ldeb %f4, 0(%r1) -; S390X-NEXT: ddb %f2, 0(%r2) ; S390X-NEXT: ddbr %f3, %f1 +; S390X-NEXT: ddb %f2, 0(%r2) ; S390X-NEXT: ddbr %f4, %f0 ; S390X-NEXT: std %f4, 16(%r2) ; S390X-NEXT: std %f3, 8(%r2) @@ -659,16 +659,16 @@ define void @constrained_vector_fmul_v3f64(<3 x double>* %a) { ; S390X-LABEL: constrained_vector_fmul_v3f64: ; S390X: # %bb.0: # %entry +; S390X-NEXT: ld %f0, 8(%r2) ; S390X-NEXT: larl %r1, .LCPI13_0 -; S390X-NEXT: ld %f0, 0(%r1) -; S390X-NEXT: ld %f1, 8(%r2) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ld %f2, 16(%r2) -; S390X-NEXT: ldr %f3, %f0 +; S390X-NEXT: mdbr %f0, %f1 +; S390X-NEXT: ldr %f3, %f1 ; S390X-NEXT: mdb %f3, 0(%r2) -; S390X-NEXT: mdbr %f1, %f0 -; S390X-NEXT: mdbr %f2, %f0 +; S390X-NEXT: mdbr %f2, %f1 ; S390X-NEXT: std %f2, 16(%r2) -; S390X-NEXT: std %f1, 8(%r2) +; S390X-NEXT: std %f0, 8(%r2) ; S390X-NEXT: std %f3, 0(%r2) ; S390X-NEXT: br %r14 ; @@ -832,16 +832,16 @@ define void @constrained_vector_fadd_v3f64(<3 x double>* %a) { ; S390X-LABEL: constrained_vector_fadd_v3f64: ; S390X: # %bb.0: # %entry +; S390X-NEXT: ld %f0, 8(%r2) ; S390X-NEXT: larl %r1, .LCPI18_0 -; S390X-NEXT: ld %f0, 0(%r1) -; S390X-NEXT: ld %f1, 8(%r2) +; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: ld %f2, 16(%r2) -; S390X-NEXT: ldr %f3, %f0 +; S390X-NEXT: adbr %f0, %f1 +; S390X-NEXT: ldr %f3, %f1 ; S390X-NEXT: adb %f3, 0(%r2) -; S390X-NEXT: adbr %f1, %f0 -; S390X-NEXT: adbr %f2, %f0 +; S390X-NEXT: adbr %f2, %f1 ; S390X-NEXT: std %f2, 16(%r2) -; S390X-NEXT: std %f1, 8(%r2) +; S390X-NEXT: std %f0, 8(%r2) ; S390X-NEXT: std %f3, 0(%r2) ; S390X-NEXT: br %r14 ; @@ -969,14 +969,14 @@ ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI22_0 ; S390X-NEXT: le %f0, 0(%r1) -; S390X-NEXT: lzer %f1 ; S390X-NEXT: ler %f4, %f0 -; S390X-NEXT: sebr %f4, %f1 ; S390X-NEXT: larl %r1, .LCPI22_1 ; S390X-NEXT: ler %f2, %f0 ; S390X-NEXT: seb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI22_2 ; S390X-NEXT: seb %f0, 0(%r1) +; S390X-NEXT: lzer %f1 +; S390X-NEXT: sebr %f4, %f1 ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fsub_v3f32:
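
Note on the new dependency scheme: buildSchedGraph() walks the block
bottom-up, so BarrierChain is always the nearest global barrier *below*
the current instruction, and the FPExceptions map collects the SUnits
that a barrier found further up still has to be ordered against. Below
is a condensed sketch of the edges this patch creates; the names follow
ScheduleDAGInstrs.cpp, but the code is illustrative only and not part
of the diff above:

    // Bottom-up scan: BarrierChain is the nearest barrier below MI.
    if (isGlobalMemoryObject(AA, &MI)) {   // e.g. a call to llvm.s390.sfpc
      if (BarrierChain)
        BarrierChain->addPredBarrier(SU);  // chain consecutive barriers
      BarrierChain = SU;
      // Every FP-exception SU recorded so far lies below the new barrier
      // and must stay there; addBarrierChain() makes each of them a
      // successor of the barrier and then clears the map.
      addBarrierChain(FPExceptions);
    } else if (MI.mayRaiseFPException()) {
      // The barrier already seen below MI must stay below it ...
      if (BarrierChain)
        BarrierChain->addPredBarrier(SU);
      // ... and MI is recorded so that a barrier found further up can
      // order itself before it. Note that no edges are created between
      // two FP-exception instructions themselves, which is why e.g. @f3
      // can still interleave the two sqebr with the stores.
      FPExceptions.insert(SU, UnknownValue);
    }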