Index: lib/CodeGen/ScheduleDAGInstrs.cpp =================================================================== --- lib/CodeGen/ScheduleDAGInstrs.cpp +++ lib/CodeGen/ScheduleDAGInstrs.cpp @@ -234,6 +234,11 @@ // Ask the target if address-backscheduling is desirable, and if so how much. const TargetSubtargetInfo &ST = MF.getSubtarget(); + // Only use any non-zero latency for real defs/uses, in contrast to + // "fake" operands added by regalloc. + const MCInstrDesc *DefMIDesc = &SU->getInstr()->getDesc(); + bool ImplicitPseudoDef = (OperIdx >= DefMIDesc->getNumOperands() && + !DefMIDesc->hasImplicitDefOfPhysReg(MO.getReg())); for (MCRegAliasIterator Alias(MO.getReg(), TRI, true); Alias.isValid(); ++Alias) { if (!Uses.contains(*Alias)) @@ -257,11 +262,18 @@ Dep = SDep(SU, SDep::Data, *Alias); RegUse = UseSU->getInstr(); } - Dep.setLatency( - SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, RegUse, - UseOp)); + const MCInstrDesc *UseMIDesc = + (RegUse ? &UseSU->getInstr()->getDesc() : nullptr); + bool ImplicitPseudoUse = + (UseMIDesc && UseOp >= ((int)UseMIDesc->getNumOperands()) && + !UseMIDesc->hasImplicitUseOfPhysReg(*Alias)); + if (!ImplicitPseudoDef && !ImplicitPseudoUse) { + Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, + RegUse, UseOp)); + ST.adjustSchedDependency(SU, UseSU, Dep); + } else + Dep.setLatency(0); - ST.adjustSchedDependency(SU, UseSU, Dep); UseSU->addPred(Dep); } } Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -61,11 +61,11 @@ ; MESA-DAG: s_mov_b64 s[0:1], s[36:37] +; GCN: v_mov_b32_e32 v0, 1{{$}} +; MESA-DAG: s_mov_b64 s[2:3], s[38:39] ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 
external_void_func_i1@rel32@hi+4 -; GCN-DAG: v_mov_b32_e32 v0, 1{{$}} -; MESA-DAG: s_mov_b64 s[2:3], s[38:39] ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm @@ -123,12 +123,12 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm: ; MESA-DAG: s_mov_b32 s33, s3{{$}} +; GCN: v_mov_b32_e32 v0, 0x7b +; HSA-DAG: s_mov_b32 s4, s33{{$}} ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4 -; GCN-NEXT: v_mov_b32_e32 v0, 0x7b -; HSA-DAG: s_mov_b32 s4, s33{{$}} ; GCN-DAG: s_mov_b32 s32, s33{{$}} ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -144,11 +144,11 @@ ; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: buffer_load_sbyte v0 +; GCN: s_mov_b32 s4, s33 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s3 ; GCN: s_waitcnt vmcnt(0) @@ -165,11 +165,11 @@ ; HSA-DAG: s_mov_b32 s33, s9{{$}} ; GCN-DAG: buffer_load_ubyte v0 +; GCN: s_mov_b32 s4, s33 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_waitcnt vmcnt(0) @@ -197,11 +197,11 @@ ; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: buffer_load_sshort v0 +; GCN: s_mov_b32 s4, s33 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 
external_void_func_i16_signext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_waitcnt vmcnt(0) @@ -218,11 +218,11 @@ ; GCN-DAG: buffer_load_ushort v0 +; GCN: s_mov_b32 s4, s33 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_waitcnt vmcnt(0) @@ -237,11 +237,11 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm: ; MESA-DAG: s_mov_b32 s33, s3{{$}} +; GCN: v_mov_b32_e32 v0, 42 +; GCN: s_mov_b32 s4, s33 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4 -; GCN: v_mov_b32_e32 v0, 42 -; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -481,10 +481,10 @@ ; HSA-DAG: s_mov_b32 s33, s9 ; MESA-DAG: s_mov_b32 s33, s3{{$}} +; GCN-NOT: v3 ; GCN-DAG: v_mov_b32_e32 v0, 3 ; GCN-DAG: v_mov_b32_e32 v1, 4 ; GCN-DAG: v_mov_b32_e32 v2, 5 -; GCN-NOT: v3 ; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { Index: test/CodeGen/AMDGPU/call-preserved-registers.ll =================================================================== --- test/CodeGen/AMDGPU/call-preserved-registers.ll +++ test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -6,10 +6,10 @@ ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: s_mov_b32 s33, s7 -; GCN: s_getpc_b64 s[34:35] +; GCN: s_mov_b32 s4, s33 +; GCN-NEXT: s_getpc_b64 s[34:35] ; GCN-NEXT: s_add_u32 s34, s34, ; GCN-NEXT: s_addc_u32 s35, s35, -; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN: 
s_swappc_b64 s[30:31], s[34:35] @@ -129,13 +129,13 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: ; GCN: s_mov_b32 s34, s9 -; GCN: ; def s33 -; GCN-NEXT: #ASMEND -; GCN: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 -; GCN-NEXT: s_mov_b32 s4, s34 -; GCN-NEXT: s_mov_b32 s32, s34 +; GCN: s_mov_b32 s4, s34 +; GCN-DAG: s_mov_b32 s32, s34 +; GCN-DAG: ; def s33 +; GCN-DAG: #ASMEND +; GCN-DAG: s_getpc_b64 s[6:7] +; GCN-DAG: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 +; GCN-DAG: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s33 @@ -150,13 +150,13 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: ; GCN: s_mov_b32 s33, s9 -; GCN: ; def v32 -; GCN-NEXT: #ASMEND -; GCN: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: ; def v32 +; GCN-DAG: #ASMEND +; GCN-DAG: s_getpc_b64 s[6:7] +; GCN-DAG: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 +; GCN-DAG: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use v32 @@ -183,10 +183,10 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: ; GCN: s_mov_b32 s33, s7 -; GCN: s_getpc_b64 +; GCN: s_mov_b32 s4, s33 +; GCN-NEXT: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ 
test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -558,7 +558,8 @@ ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill: ; GCN: s_mov_b32 s5, s32 -; GCN: s_add_u32 s32, s32, 0x400 + +; GCN-DAG: s_add_u32 s32, s32, 0x400 ; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s14 ; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-68-9][0-9]*]], s15 Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -386,9 +386,9 @@ ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]] -; GCN: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: ; GCN-NEXT: s_waitcnt vmcnt(0) Index: test/CodeGen/AMDGPU/inline-asm.ll =================================================================== --- test/CodeGen/AMDGPU/inline-asm.ll +++ test/CodeGen/AMDGPU/inline-asm.ll @@ -186,8 +186,8 @@ ; FIXME: Should not have intermediate sgprs ; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr: -; CHECK: s_mov_b32 s1, 0 -; CHECK: s_mov_b32 s0, 0x1e240 +; CHECK-DAG: s_mov_b32 s1, 0 +; CHECK-DAG: s_mov_b32 s0, 0x1e240 ; CHECK: v_mov_b32_e32 v0, s0 ; CHECK: v_mov_b32_e32 v1, s1 ; CHECK: use v[0:1] Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -352,7 +352,7 @@ ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000 ; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]] -; GCN: v_movreld_b32_e32 v{{[0-9]+}}, 0 +; GCN-DAG: v_movreld_b32_e32 
v{{[0-9]+}}, 0 ; Increment to next element folded into base register, but FileCheck ; can't do math expressions Index: test/CodeGen/AMDGPU/misched-killflags.mir =================================================================== --- test/CodeGen/AMDGPU/misched-killflags.mir +++ test/CodeGen/AMDGPU/misched-killflags.mir @@ -26,20 +26,20 @@ S_ENDPGM ... # CHECK-LABEL: name: func0 -# CHECK: $sgpr10 = S_MOV_B32 5 -# CHECK: $sgpr9 = S_MOV_B32 4 -# CHECK: $sgpr8 = S_MOV_B32 3 -# CHECK: $sgpr33 = S_MOV_B32 killed $sgpr7 +# CHECK-DAG: $sgpr10 = S_MOV_B32 5 +# CHECK-DAG: $sgpr9 = S_MOV_B32 4 +# CHECK-DAG: $sgpr8 = S_MOV_B32 3 +# CHECK-DAG: $sgpr33 = S_MOV_B32 killed $sgpr7 # CHECK: $vgpr0 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr8_sgpr9_sgpr10_sgpr11 +# CHECK: $sgpr32 = S_MOV_B32 $sgpr33 # CHECK: BUNDLE implicit-def $sgpr6_sgpr7, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $scc { # CHECK: $sgpr6_sgpr7 = S_GETPC_B64 # CHECK: $sgpr6 = S_ADD_U32 internal $sgpr6, 0, implicit-def $scc # CHECK: $sgpr7 = S_ADDC_U32 internal $sgpr7, 0, implicit-def $scc, implicit internal $scc # CHECK: } -# CHECK: $sgpr4 = S_MOV_B32 $sgpr33 +# CHECK: $sgpr4 = S_MOV_B32 killed $sgpr33 # CHECK: $vgpr1 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 # CHECK: $vgpr2 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 # CHECK: $vgpr3 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec -# CHECK: $sgpr32 = S_MOV_B32 killed $sgpr33 # CHECK: S_NOP 0, implicit killed $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 # CHECK: S_ENDPGM Index: test/CodeGen/AMDGPU/nested-calls.ll =================================================================== --- test/CodeGen/AMDGPU/nested-calls.ll +++ test/CodeGen/AMDGPU/nested-calls.ll @@ -33,8 +33,8 @@ ; GCN-LABEL: 
{{^}}test_func_call_external_void_func_i32_imm_stack_use: ; GCN: s_waitcnt ; GCN: s_mov_b32 s5, s32 -; GCN: s_add_u32 s32, s32, 0x1400{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset +; GCN-DAG: s_add_u32 s32, s32, 0x1400{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset ; GCN: s_swappc_b64 ; GCN: s_sub_u32 s32, s32, 0x1400{{$}} ; GCN: s_setpc_b64 Index: test/CodeGen/AMDGPU/undefined-subreg-liverange.ll =================================================================== --- test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -57,18 +57,18 @@ ; CHECK-LABEL: {{^}}partially_undef_copy: ; CHECK: v_mov_b32_e32 v5, 5 -; CHECK: v_mov_b32_e32 v6, 6 +; CHECK-DAG: v_mov_b32_e32 v6, 6 -; CHECK: v_mov_b32_e32 v[[OUTPUT_LO:[0-9]+]], v5 +; CHECK-DAG: v_mov_b32_e32 v[[OUTPUT_LO:[0-9]+]], v5 ; Undef copy -; CHECK: v_mov_b32_e32 v1, v6 +; CHECK-DAG: v_mov_b32_e32 v1, v6 ; undef copy -; CHECK: v_mov_b32_e32 v2, v7 +; CHECK-DAG: v_mov_b32_e32 v2, v7 -; CHECK: v_mov_b32_e32 v[[OUTPUT_HI:[0-9]+]], v8 -; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6 +; CHECK-DAG: v_mov_b32_e32 v[[OUTPUT_HI:[0-9]+]], v8 +; CHECK-DAG: v_mov_b32_e32 v[[OUTPUT_LO]], v6 ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}} define amdgpu_kernel void @partially_undef_copy() #0 { Index: test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll =================================================================== --- test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll +++ test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll @@ -19,9 +19,9 @@ ; CHECK-LABEL: isel ; CHECK: push {r4, r5, r6, lr} -; CHECK: movw r12, #0 -; CHECK: movt r12, #0 -; CHECK: movw r4, #{{\d*}} +; CHECK-DAG: movw r12, #0 +; CHECK-DAG: movt r12, #0 +; CHECK-DAG: movw r4, #{{\d*}} ; CHECK: blx r12 ; CHECK: sub.w sp, sp, r4 Index: test/CodeGen/ARM/Windows/chkstk.ll =================================================================== --- 
test/CodeGen/ARM/Windows/chkstk.ll +++ test/CodeGen/ARM/Windows/chkstk.ll @@ -16,9 +16,9 @@ ; CHECK-DEFAULT-CODE-MODEL: sub.w sp, sp, r4 ; CHECK-LARGE-CODE-MODEL: check_watermark: -; CHECK-LARGE-CODE-MODEL: movw r12, :lower16:__chkstk -; CHECK-LARGE-CODE-MODEL: movt r12, :upper16:__chkstk -; CHECK-LARGE-CODE-MODEL: movw r4, #1024 +; CHECK-LARGE-CODE-MODEL-DAG: movw r12, :lower16:__chkstk +; CHECK-LARGE-CODE-MODEL-DAG: movt r12, :upper16:__chkstk +; CHECK-LARGE-CODE-MODEL-DAG: movw r4, #1024 ; CHECK-LARGE-CODE-MODEL: blx r12 ; CHECK-LARGE-CODE-MODEL: sub.w sp, sp, r4 Index: test/CodeGen/ARM/Windows/memset.ll =================================================================== --- test/CodeGen/ARM/Windows/memset.ll +++ test/CodeGen/ARM/Windows/memset.ll @@ -10,9 +10,9 @@ unreachable } -; CHECK: movw r0, :lower16:source -; CHECK: movt r0, :upper16:source ; CHECK: movs r1, #0 ; CHECK: mov.w r2, #512 +; CHECK: movw r0, :lower16:source +; CHECK: movt r0, :upper16:source ; CHECK: memset Index: test/CodeGen/ARM/arm-and-tst-peephole.ll =================================================================== --- test/CodeGen/ARM/arm-and-tst-peephole.ll +++ test/CodeGen/ARM/arm-and-tst-peephole.ll @@ -163,8 +163,8 @@ ; ; T2-LABEL: test_tst_assessment: ; T2: @ %bb.0: -; T2-NEXT: lsls r1, r1, #31 ; T2-NEXT: and r0, r0, #1 +; T2-NEXT: lsls r1, r1, #31 ; T2-NEXT: it ne ; T2-NEXT: subne r0, #1 ; T2-NEXT: bx lr Index: test/CodeGen/ARM/arm-shrink-wrapping.ll =================================================================== --- test/CodeGen/ARM/arm-shrink-wrapping.ll +++ test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -104,10 +104,10 @@ ; Next BB. 
; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1 -; ARM: subs [[IV]], [[IV]], #1 -; THUMB: subs [[IV]], #1 -; ARM-NEXT: add [[SUM]], [[TMP]], [[SUM]] -; THUMB-NEXT: add [[SUM]], [[TMP]] +; ARM: add [[SUM]], [[TMP]], [[SUM]] +; THUMB: add [[SUM]], [[TMP]] +; ARM-NEXT: subs [[IV]], [[IV]], #1 +; THUMB-NEXT: subs [[IV]], #1 ; CHECK-NEXT: bne [[LOOP]] ; ; Next BB. @@ -169,10 +169,10 @@ ; Next BB. ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: @ %for.body ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1 -; ARM: subs [[IV]], [[IV]], #1 -; THUMB: subs [[IV]], #1 ; ARM: add [[SUM]], [[TMP]], [[SUM]] ; THUMB: add [[SUM]], [[TMP]] +; ARM: subs [[IV]], [[IV]], #1 +; THUMB: subs [[IV]], #1 ; CHECK-NEXT: bne [[LOOP_LABEL]] ; Next BB. ; CHECK: @ %for.exit @@ -228,10 +228,10 @@ ; Next BB. ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1 -; ARM: subs [[IV]], [[IV]], #1 -; THUMB: subs [[IV]], #1 -; ARM-NEXT: add [[SUM]], [[TMP]], [[SUM]] -; THUMB-NEXT: add [[SUM]], [[TMP]] +; ARM: add [[SUM]], [[TMP]], [[SUM]] +; THUMB: add [[SUM]], [[TMP]] +; ARM-NEXT: subs [[IV]], [[IV]], #1 +; THUMB-NEXT: subs [[IV]], #1 ; CHECK-NEXT: bne [[LOOP]] ; ; Next BB. @@ -307,10 +307,10 @@ ; Next BB. ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1 -; ARM: subs [[IV]], [[IV]], #1 -; THUMB: subs [[IV]], #1 -; ARM-NEXT: add [[SUM]], [[TMP]], [[SUM]] -; THUMB-NEXT: add [[SUM]], [[TMP]] +; ARM: add [[SUM]], [[TMP]], [[SUM]] +; THUMB: add [[SUM]], [[TMP]] +; ARM-NEXT: subs [[IV]], [[IV]], #1 +; THUMB-NEXT: subs [[IV]], #1 ; CHECK-NEXT: bne [[LOOP]] ; ; Next BB. 
Index: test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll =================================================================== --- test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll +++ test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll @@ -18,9 +18,9 @@ ; CHECK-NEXT: Data ; CHECK-SAME: Latency=3 ; CHECK-NEXT: Data -; CHECK-SAME: Latency=3 +; CHECK-SAME: Latency=0 ; CHECK-NEXT: Data -; CHECK-SAME: Latency=4 +; CHECK-SAME: Latency=0 define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize { %1 = load i32, i32* @a, align 4 %2 = load i32, i32* @b, align 4 Index: test/CodeGen/ARM/cortex-a57-misched-ldm.ll =================================================================== --- test/CodeGen/ARM/cortex-a57-misched-ldm.ll +++ test/CodeGen/ARM/cortex-a57-misched-ldm.ll @@ -11,7 +11,7 @@ ; CHECK: Data ; CHECK-SAME: Latency=3 ; CHECK-NEXT: Data -; CHECK-SAME: Latency=3 +; CHECK-SAME: Latency=0 define i32 @foo(i32* %a) nounwind optsize { entry: Index: test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll =================================================================== --- test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll +++ test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll @@ -20,9 +20,9 @@ ; CHECK-NEXT: Data ; CHECK-SAME: Latency=5 ; CHECK-NEXT: Data -; CHECK-SAME: Latency=5 +; CHECK-SAME: Latency=0 ; CHECK-NEXT: Data -; CHECK-SAME: Latency=6 +; CHECK-SAME: Latency=0 define i32 @bar(i32* %iptr) minsize optsize { %1 = load double, double* @a, align 8 %2 = load double, double* @b, align 8 Index: test/CodeGen/ARM/cortex-a57-misched-vldm.ll =================================================================== --- test/CodeGen/ARM/cortex-a57-misched-vldm.ll +++ test/CodeGen/ARM/cortex-a57-misched-vldm.ll @@ -11,9 +11,9 @@ ; CHECK: Data ; CHECK-SAME: Latency=5 ; CHECK-NEXT: Data -; CHECK-SAME: Latency=5 +; CHECK-SAME: Latency=0 ; CHECK-NEXT: Data -; CHECK-SAME: Latency=6 +; CHECK-SAME: Latency=0 define double @foo(double* %a) nounwind optsize { entry: Index: 
test/CodeGen/ARM/fp16-instructions.ll =================================================================== --- test/CodeGen/ARM/fp16-instructions.ll +++ test/CodeGen/ARM/fp16-instructions.ll @@ -935,9 +935,9 @@ ; CHECK-SOFTFP-FP16-T32: vmov [[S6:s[0-9]]], r0 ; CHECK-SOFTFP-FP16-T32: vldr s0, .LCP{{.*}} ; CHECK-SOFTFP-FP16-T32: vcvtb.f32.f16 [[S6]], [[S6]] -; CHECK-SOFTFP-FP16-T32: vmov.f32 [[S2:s[0-9]]], #-2.000000e+00 -; CHECK-SOFTFP-FP16-T32: vcmp.f32 [[S6]], s0 ; CHECK-SOFTFP-FP16-T32: vldr [[S4:s[0-9]]], .LCPI{{.*}} +; CHECK-SOFTFP-FP16-T32: vcmp.f32 [[S6]], s0 +; CHECK-SOFTFP-FP16-T32: vmov.f32 [[S2:s[0-9]]], #-2.000000e+00 ; CHECK-SOFTFP-FP16-T32: vmrs APSR_nzcv, fpscr ; CHECK-SOFTFP-FP16-T32: it eq ; CHECK-SOFTFP-FP16-T32: vmoveq.f32 [[S4]], [[S2]] Index: test/CodeGen/ARM/select.ll =================================================================== --- test/CodeGen/ARM/select.ll +++ test/CodeGen/ARM/select.ll @@ -80,8 +80,8 @@ ; block generated, odds are good that we have close to the ideal code for this: ; ; CHECK-NEON-LABEL: f8: -; CHECK-NEON: movw [[R3:r[0-9]+]], #1123 ; CHECK-NEON: adr [[R2:r[0-9]+]], LCPI7_0 +; CHECK-NEON: movw [[R3:r[0-9]+]], #1123 ; CHECK-NEON-NEXT: cmp r0, [[R3]] ; CHECK-NEON-NEXT: it eq ; CHECK-NEON-NEXT: addeq{{.*}} [[R2]], #4 Index: test/CodeGen/ARM/twoaddrinstr.ll =================================================================== --- test/CodeGen/ARM/twoaddrinstr.ll +++ test/CodeGen/ARM/twoaddrinstr.ll @@ -4,8 +4,8 @@ define void @PR13378() nounwind { ; This was orriginally a crasher trying to schedule the instructions. 
; CHECK-LABEL: PR13378: -; CHECK: vld1.32 -; CHECK-NEXT: vmov.i32 +; CHECK: vmov.i32 +; CHECK-NEXT: vld1.32 ; CHECK-NEXT: vst1.32 ; CHECK-NEXT: vst1.32 ; CHECK-NEXT: vmov.f32 Index: test/CodeGen/ARM/vcombine.ll =================================================================== --- test/CodeGen/ARM/vcombine.ll +++ test/CodeGen/ARM/vcombine.ll @@ -39,8 +39,8 @@ ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0] ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1] -; CHECK-LE: vmov r0, r1, [[LD0]] ; CHECK-LE: vmov r2, r3, [[LD1]] +; CHECK-LE: vmov r0, r1, [[LD0]] ; CHECK-BE: vmov r1, r0, d16 ; CHECK-BE: vmov r3, r2, d17 @@ -56,8 +56,8 @@ ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0] ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1] -; CHECK-LE: vmov r0, r1, [[LD0]] ; CHECK-LE: vmov r2, r3, [[LD1]] +; CHECK-LE: vmov r0, r1, [[LD0]] ; CHECK-BE: vmov r1, r0, d16 ; CHECK-BE: vmov r3, r2, d17 @@ -72,11 +72,11 @@ ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0] ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1] -; CHECK-LE: vmov r0, r1, [[LD0]] ; CHECK-LE: vmov r2, r3, [[LD1]] +; CHECK-LE: vmov r0, r1, [[LD0]] -; CHECK-BE: vmov r1, r0, [[LD0]] ; CHECK-BE: vmov r3, r2, [[LD1]] +; CHECK-BE: vmov r1, r0, [[LD0]] %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B %tmp3 = shufflevector <1 x i64> %tmp1, <1 x i64> %tmp2, <2 x i32> Index: test/CodeGen/ARM/vuzp.ll =================================================================== --- test/CodeGen/ARM/vuzp.ll +++ test/CodeGen/ARM/vuzp.ll @@ -324,23 +324,23 @@ ; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation for i8. 
; CHECK-LABEL: cmpsel_trunc: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r12, sp, #16 -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; CHECK-NEXT: mov r12, sp -; CHECK-NEXT: vld1.64 {d18, d19}, [r12] -; CHECK-NEXT: add r12, sp, #48 -; CHECK-NEXT: vld1.64 {d20, d21}, [r12] -; CHECK-NEXT: add r12, sp, #32 -; CHECK-NEXT: vcgt.u32 q8, q10, q8 -; CHECK-NEXT: vld1.64 {d20, d21}, [r12] -; CHECK-NEXT: vcgt.u32 q9, q10, q9 -; CHECK-NEXT: vmov d20, r2, r3 -; CHECK-NEXT: vmovn.i32 d17, q8 -; CHECK-NEXT: vmovn.i32 d16, q9 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vmovn.i16 d16, q8 -; CHECK-NEXT: vbsl d16, d18, d20 -; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: add r12, sp, #16 +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: add r12, sp, #48 +; CHECK-NEXT: vld1.64 {d20, d21}, [r12] +; CHECK-NEXT: add r12, sp, #32 +; CHECK-NEXT: vcgt.u32 q8, q10, q8 +; CHECK-NEXT: vld1.64 {d20, d21}, [r12] +; CHECK-NEXT: vcgt.u32 q9, q10, q9 +; CHECK-NEXT: vmov d20, r2, r3 +; CHECK-NEXT: vmovn.i32 d17, q8 +; CHECK-NEXT: vmovn.i32 d16, q9 +; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vmovn.i16 d16, q8 +; CHECK-NEXT: vbsl d16, d18, d20 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %c = icmp ult <8 x i32> %cmp0, %cmp1 %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1 @@ -353,28 +353,28 @@ define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1, ; CHECK-LABEL: vuzp_trunc_and_shuffle: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: add r12, sp, #8 -; CHECK-NEXT: add lr, sp, #24 -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; CHECK-NEXT: ldr r12, [sp, #40] -; CHECK-NEXT: vld1.64 {d18, d19}, [lr] -; CHECK-NEXT: vcgt.u32 q8, q9, q8 -; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32] -; CHECK-NEXT: vmov.i8 d19, #0x7 -; CHECK-NEXT: vmovl.u8 q10, d18 -; CHECK-NEXT: vmovn.i32 d16, q8 -; CHECK-NEXT: vneg.s8 d17, d19 -; CHECK-NEXT: vmov d18, r2, r3 -; CHECK-NEXT: vuzp.8 d16, d20 
-; CHECK-NEXT: vshl.i8 d16, d16, #7 -; CHECK-NEXT: vshl.s8 d16, d16, d17 -; CHECK-NEXT: vmov d17, r0, r1 -; CHECK-NEXT: vbsl d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: pop {r11, lr} -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: add r12, sp, #8 +; CHECK-NEXT: add lr, sp, #24 +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: ldr r12, [sp, #40] +; CHECK-NEXT: vld1.64 {d18, d19}, [lr] +; CHECK-NEXT: vcgt.u32 q8, q9, q8 +; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32] +; CHECK-NEXT: vmov.i8 d19, #0x7 +; CHECK-NEXT: vmovl.u8 q10, d18 +; CHECK-NEXT: vmovn.i32 d16, q8 +; CHECK-NEXT: vneg.s8 d17, d19 +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vuzp.8 d16, d20 +; CHECK-NEXT: vshl.i8 d16, d16, #7 +; CHECK-NEXT: vshl.s8 d16, d16, d17 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vbsl d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: mov pc, lr <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> @@ -389,22 +389,22 @@ define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1, ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r12, sp -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; CHECK-NEXT: add r12, sp, #16 -; CHECK-NEXT: vld1.64 {d18, d19}, [r12] -; CHECK-NEXT: vcgt.u32 q8, q9, q8 -; CHECK-NEXT: vmov.i8 d18, #0x7 -; CHECK-NEXT: vmovn.i32 d16, q8 -; CHECK-NEXT: vuzp.8 d16, d17 -; CHECK-NEXT: vneg.s8 d17, d18 -; CHECK-NEXT: vshl.i8 d16, d16, #7 -; CHECK-NEXT: vmov d18, r2, r3 -; CHECK-NEXT: vshl.s8 d16, d16, d17 -; CHECK-NEXT: vmov d17, r0, r1 -; CHECK-NEXT: vbsl d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: add r12, sp, #16 +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vcgt.u32 q8, 
q9, q8 +; CHECK-NEXT: vmov.i8 d18, #0x7 +; CHECK-NEXT: vmovn.i32 d16, q8 +; CHECK-NEXT: vuzp.8 d16, d17 +; CHECK-NEXT: vneg.s8 d17, d18 +; CHECK-NEXT: vshl.i8 d16, d16, #7 +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vshl.s8 d16, d16, d17 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vbsl d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> @@ -417,23 +417,23 @@ define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1, ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r12, sp -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; CHECK-NEXT: add r12, sp, #16 -; CHECK-NEXT: vld1.64 {d18, d19}, [r12] -; CHECK-NEXT: vcgt.u32 q8, q9, q8 -; CHECK-NEXT: vldr d18, .LCPI22_0 -; CHECK-NEXT: vmov.i8 d19, #0x7 -; CHECK-NEXT: vmovn.i32 d16, q8 -; CHECK-NEXT: vtbl.8 d16, {d16}, d18 -; CHECK-NEXT: vneg.s8 d17, d19 -; CHECK-NEXT: vmov d18, r2, r3 -; CHECK-NEXT: vshl.i8 d16, d16, #7 -; CHECK-NEXT: vshl.s8 d16, d16, d17 -; CHECK-NEXT: vmov d17, r0, r1 -; CHECK-NEXT: vbsl d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: add r12, sp, #16 +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vcgt.u32 q8, q9, q8 +; CHECK-NEXT: vldr d18, .LCPI22_0 +; CHECK-NEXT: vmov.i8 d19, #0x7 +; CHECK-NEXT: vmovn.i32 d16, q8 +; CHECK-NEXT: vtbl.8 d16, {d16}, d18 +; CHECK-NEXT: vneg.s8 d17, d19 +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vshl.i8 d16, d16, #7 +; CHECK-NEXT: vshl.s8 d16, d16, d17 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vbsl d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI22_0: @@ -459,55 +459,55 @@ define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x 
i8> %tr1, ; CHECK-LABEL: vuzp_wide_type: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: add r12, sp, #32 -; CHECK-NEXT: add lr, sp, #48 -; CHECK-NEXT: vld1.32 {d17[0]}, [r12:32] -; CHECK-NEXT: add r12, sp, #24 -; CHECK-NEXT: vld1.32 {d16[0]}, [r12:32] -; CHECK-NEXT: add r12, sp, #56 -; CHECK-NEXT: vld1.32 {d19[0]}, [r12:32] -; CHECK-NEXT: ldr r12, [sp, #68] -; CHECK-NEXT: vld1.32 {d18[0]}, [lr:32] -; CHECK-NEXT: add lr, sp, #40 -; CHECK-NEXT: vld1.32 {d20[0]}, [lr:32] -; CHECK-NEXT: ldr r4, [r12] -; CHECK-NEXT: vmov.32 d23[0], r4 -; CHECK-NEXT: add r4, sp, #64 -; CHECK-NEXT: vld1.32 {d24[0]}, [r4:32] -; CHECK-NEXT: add r4, sp, #36 -; CHECK-NEXT: vld1.32 {d17[1]}, [r4:32] -; CHECK-NEXT: add r4, sp, #28 -; CHECK-NEXT: vcgt.u32 q10, q12, q10 -; CHECK-NEXT: vmov.u8 lr, d23[3] -; CHECK-NEXT: vld1.32 {d16[1]}, [r4:32] -; CHECK-NEXT: add r4, sp, #60 -; CHECK-NEXT: vld1.32 {d19[1]}, [r4:32] -; CHECK-NEXT: add r4, sp, #52 -; CHECK-NEXT: vld1.32 {d18[1]}, [r4:32] -; CHECK-NEXT: add r4, r12, #4 -; CHECK-NEXT: vcgt.u32 q8, q9, q8 -; CHECK-NEXT: vmovn.i32 d19, q10 -; CHECK-NEXT: vldr d20, .LCPI23_0 -; CHECK-NEXT: vmovn.i32 d18, q8 -; CHECK-NEXT: vmovn.i16 d22, q9 -; CHECK-NEXT: vmov.i8 q9, #0x7 -; CHECK-NEXT: vmov.8 d17[0], lr -; CHECK-NEXT: vneg.s8 q9, q9 -; CHECK-NEXT: vtbl.8 d16, {d22, d23}, d20 -; CHECK-NEXT: vld1.8 {d17[1]}, [r4] -; CHECK-NEXT: add r4, sp, #8 -; CHECK-NEXT: vshl.i8 q8, q8, #7 -; CHECK-NEXT: vld1.64 {d20, d21}, [r4] -; CHECK-NEXT: vshl.s8 q8, q8, q9 -; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbsl q8, q9, q10 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: pop {r4, lr} -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: add r12, sp, #32 +; CHECK-NEXT: add lr, sp, #48 +; CHECK-NEXT: vld1.32 {d17[0]}, [r12:32] +; CHECK-NEXT: add r12, sp, #24 +; CHECK-NEXT: vld1.32 {d16[0]}, [r12:32] +; CHECK-NEXT: add 
r12, sp, #56 +; CHECK-NEXT: vld1.32 {d19[0]}, [r12:32] +; CHECK-NEXT: vld1.32 {d18[0]}, [lr:32] +; CHECK-NEXT: add lr, sp, #40 +; CHECK-NEXT: vld1.32 {d20[0]}, [lr:32] +; CHECK-NEXT: ldr r12, [sp, #68] +; CHECK-NEXT: ldr r4, [r12] +; CHECK-NEXT: vmov.32 d23[0], r4 +; CHECK-NEXT: add r4, sp, #64 +; CHECK-NEXT: vld1.32 {d24[0]}, [r4:32] +; CHECK-NEXT: add r4, sp, #36 +; CHECK-NEXT: vcgt.u32 q10, q12, q10 +; CHECK-NEXT: vld1.32 {d17[1]}, [r4:32] +; CHECK-NEXT: add r4, sp, #28 +; CHECK-NEXT: vld1.32 {d16[1]}, [r4:32] +; CHECK-NEXT: add r4, sp, #60 +; CHECK-NEXT: vld1.32 {d19[1]}, [r4:32] +; CHECK-NEXT: add r4, sp, #52 +; CHECK-NEXT: vld1.32 {d18[1]}, [r4:32] +; CHECK-NEXT: add r4, r12, #4 +; CHECK-NEXT: vcgt.u32 q8, q9, q8 +; CHECK-NEXT: vmovn.i32 d19, q10 +; CHECK-NEXT: vmov.u8 lr, d23[3] +; CHECK-NEXT: vldr d20, .LCPI23_0 +; CHECK-NEXT: vmovn.i32 d18, q8 +; CHECK-NEXT: vmovn.i16 d22, q9 +; CHECK-NEXT: vmov.i8 q9, #0x7 +; CHECK-NEXT: vneg.s8 q9, q9 +; CHECK-NEXT: vmov.8 d17[0], lr +; CHECK-NEXT: vtbl.8 d16, {d22, d23}, d20 +; CHECK-NEXT: vld1.8 {d17[1]}, [r4] +; CHECK-NEXT: add r4, sp, #8 +; CHECK-NEXT: vshl.i8 q8, q8, #7 +; CHECK-NEXT: vld1.64 {d20, d21}, [r4] +; CHECK-NEXT: vshl.s8 q8, q8, q9 +; CHECK-NEXT: vmov d19, r2, r3 +; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vbsl q8, q9, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: mov pc, lr ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI23_0: Index: test/CodeGen/SystemZ/misched-readadvances.mir =================================================================== --- /dev/null +++ test/CodeGen/SystemZ/misched-readadvances.mir @@ -0,0 +1,31 @@ +# Check that the extra operand for the full register added by RegAlloc does +# not have a latency that interferes with the latency adjustment +# (ReadAdvance) for the MSY register operand. 
+ +# RUN: llc %s -mtriple=s390x-linux-gnu -mcpu=z13 -start-before=machine-scheduler \ +# RUN: -debug-only=machine-scheduler -o - 2>&1 | FileCheck %s +# REQUIRES: asserts + +# CHECK: ScheduleDAGMI::schedule starting +# CHECK: SU(4): renamable $r2l = MSR renamable $r2l(tied-def 0), renamable $r2l +# CHECK: Latency : 6 +# CHECK: SU(5): renamable $r2l = MSY renamable $r2l(tied-def 0), renamable $r1d, -4, $noreg, implicit $r2d +# CHECK: Predecessors: +# CHECK: SU(4): Data Latency=2 Reg=$r2l +# CHECK: SU(4): Data Latency=0 Reg=$r2d + +--- +name: Perl_do_sv_dump +alignment: 4 +tracksRegLiveness: true +body: | + bb.0 : + %1:addr64bit = IMPLICIT_DEF + %2:addr64bit = IMPLICIT_DEF + %3:vr64bit = IMPLICIT_DEF + + bb.1 : + %2:addr64bit = ALGFI %2, 4294967291, implicit-def dead $cc + %2.subreg_l32:addr64bit = MSR %2.subreg_l32, %2.subreg_l32 + %2.subreg_l32:addr64bit = MSY %2.subreg_l32, %1, -4, $noreg +... Index: test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll =================================================================== --- test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll +++ test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll @@ -88,15 +88,15 @@ ; THUMBV7-NEXT: orrs r3, r2 ; THUMBV7-NEXT: ldr r2, [sp, #80] ; THUMBV7-NEXT: orr.w r1, r1, r4 +; THUMBV7-NEXT: orr.w r1, r1, r10 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r3, #1 -; THUMBV7-NEXT: orr.w r1, r1, r10 ; THUMBV7-NEXT: orrs.w r7, r2, r11 ; THUMBV7-NEXT: orr.w r1, r1, r9 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r7, #1 -; THUMBV7-NEXT: orr.w r0, r0, r12 ; THUMBV7-NEXT: ands r3, r7 +; THUMBV7-NEXT: orr.w r0, r0, r12 ; THUMBV7-NEXT: orrs r1, r3 ; THUMBV7-NEXT: orrs r0, r1 ; THUMBV7-NEXT: orr.w r0, r0, r8 Index: test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll =================================================================== --- test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll +++ test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll @@ -20,11 +20,11 @@ ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: 
movne r1, #1 ; THUMBV7-NEXT: cmp r5, #0 +; THUMBV7-NEXT: and.w r1, r1, r3 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r5, #1 -; THUMBV7-NEXT: ands r1, r3 +; THUMBV7-NEXT: orrs r1, r5 ; THUMBV7-NEXT: cmp.w lr, #0 -; THUMBV7-NEXT: orr.w r1, r1, r5 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne.w lr, #1 ; THUMBV7-NEXT: orr.w r1, r1, lr Index: test/CodeGen/X86/lsr-loop-exit-cond.ll =================================================================== --- test/CodeGen/X86/lsr-loop-exit-cond.ll +++ test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -97,8 +97,8 @@ ; ATOM-NEXT: pushq %r14 ; ATOM-NEXT: pushq %rbx ; ATOM-NEXT: ## kill: def $ecx killed $ecx def $rcx -; ATOM-NEXT: movl 4(%rdx), %eax ; ATOM-NEXT: movl (%rdx), %r15d +; ATOM-NEXT: movl 4(%rdx), %eax ; ATOM-NEXT: leaq 20(%rdx), %r14 ; ATOM-NEXT: movq _Te0@{{.*}}(%rip), %r9 ; ATOM-NEXT: movq _Te1@{{.*}}(%rip), %r8 @@ -116,8 +116,8 @@ ; ATOM-NEXT: movzbl %bl, %eax ; ATOM-NEXT: movl (%r10,%rax,4), %eax ; ATOM-NEXT: xorl (%r8,%rbp,4), %r15d -; ATOM-NEXT: xorl -4(%r14), %r15d ; ATOM-NEXT: xorl (%r9,%rdi,4), %eax +; ATOM-NEXT: xorl -4(%r14), %r15d ; ATOM-NEXT: xorl (%r14), %eax ; ATOM-NEXT: addq $16, %r14 ; ATOM-NEXT: LBB0_1: ## %bb @@ -130,14 +130,14 @@ ; ATOM-NEXT: movzbl %dil, %edi ; ATOM-NEXT: movl (%r8,%rdi,4), %ebx ; ATOM-NEXT: movzbl %r15b, %edi -; ATOM-NEXT: movl (%r10,%rdi,4), %edi ; ATOM-NEXT: xorl (%r9,%rbp,4), %ebx +; ATOM-NEXT: movl (%r10,%rdi,4), %edi ; ATOM-NEXT: xorl -12(%r14), %ebx ; ATOM-NEXT: xorl (%r9,%rax,4), %edi ; ATOM-NEXT: movl %ebx, %eax +; ATOM-NEXT: xorl -8(%r14), %edi ; ATOM-NEXT: shrl $24, %eax ; ATOM-NEXT: movl (%r9,%rax,4), %r15d -; ATOM-NEXT: xorl -8(%r14), %edi ; ATOM-NEXT: testq %r11, %r11 ; ATOM-NEXT: movl %edi, %eax ; ATOM-NEXT: jne LBB0_2 Index: test/CodeGen/X86/phys-reg-local-regalloc.ll =================================================================== --- test/CodeGen/X86/phys-reg-local-regalloc.ll +++ test/CodeGen/X86/phys-reg-local-regalloc.ll @@ -20,9 +20,9 @@ ; On Intel Atom the scheduler 
moves a movl instruction ; used for the printf call to follow movl 24(%esp), %eax ; ATOM: movl 24(%esp), %eax -; ATOM: movl -; ATOM: movl %eax, 36(%esp) ; ATOM-NOT: movl +; ATOM: movl %eax, 36(%esp) +; ATOM: movl ; ATOM: movl 28(%esp), %ebx ; ATOM-NOT: movl ; ATOM: movl %ebx, 40(%esp) Index: test/CodeGen/X86/schedule-x86-64-shld.ll =================================================================== --- test/CodeGen/X86/schedule-x86-64-shld.ll +++ test/CodeGen/X86/schedule-x86-64-shld.ll @@ -134,8 +134,8 @@ ; ; BTVER2-LABEL: lshift_cl_optsize: ; BTVER2: # %bb.0: # %entry -; BTVER2-NEXT: movq %rdx, %rcx # sched: [1:0.50] ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.50] +; BTVER2-NEXT: movq %rdx, %rcx # sched: [1:0.50] ; BTVER2-NEXT: # kill: def $cl killed $cl killed $rcx ; BTVER2-NEXT: shldq %cl, %rsi, %rax # sched: [4:4.00] ; BTVER2-NEXT: retq # sched: [4:1.00] @@ -210,8 +210,8 @@ ; ; BTVER2-LABEL: rshift_cl_optsize: ; BTVER2: # %bb.0: # %entry -; BTVER2-NEXT: movq %rdx, %rcx # sched: [1:0.50] ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.50] +; BTVER2-NEXT: movq %rdx, %rcx # sched: [1:0.50] ; BTVER2-NEXT: # kill: def $cl killed $cl killed $rcx ; BTVER2-NEXT: shrdq %cl, %rsi, %rax # sched: [4:4.00] ; BTVER2-NEXT: retq # sched: [4:1.00] Index: test/CodeGen/X86/schedule-x86_32.ll =================================================================== --- test/CodeGen/X86/schedule-x86_32.ll +++ test/CodeGen/X86/schedule-x86_32.ll @@ -451,8 +451,8 @@ ; ; ZNVER1-LABEL: test_arpl: ; ZNVER1: # %bb.0: -; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50] ; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50] +; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50] ; ZNVER1-NEXT: #APP ; ZNVER1-NEXT: arpl %ax, (%ecx) # sched: [100:0.25] ; ZNVER1-NEXT: #NO_APP @@ -620,10 +620,10 @@ ; ZNVER1-NEXT: pushl %esi # sched: [1:0.50] ; ZNVER1-NEXT: .cfi_def_cfa_offset 8 ; ZNVER1-NEXT: .cfi_offset %esi, -8 +; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50] ; 
ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50] ; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50] ; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [8:0.50] -; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50] ; ZNVER1-NEXT: #APP ; ZNVER1-NEXT: bound %ax, (%esi) # sched: [100:0.25] ; ZNVER1-NEXT: bound %ecx, (%edx) # sched: [100:0.25] @@ -898,8 +898,8 @@ ; ; ZNVER1-LABEL: test_dec16: ; ZNVER1: # %bb.0: -; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50] ; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50] +; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50] ; ZNVER1-NEXT: #APP ; ZNVER1-NEXT: decw %ax # sched: [1:0.25] ; ZNVER1-NEXT: decw (%ecx) # sched: [5:0.50] @@ -1105,8 +1105,8 @@ ; ; ZNVER1-LABEL: test_inc16: ; ZNVER1: # %bb.0: -; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50] ; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50] +; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50] ; ZNVER1-NEXT: #APP ; ZNVER1-NEXT: incw %ax # sched: [1:0.25] ; ZNVER1-NEXT: incw (%ecx) # sched: [5:0.50] @@ -1777,8 +1777,8 @@ ; ; ZNVER1-LABEL: test_pop_push_16: ; ZNVER1: # %bb.0: -; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50] ; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50] +; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50] ; ZNVER1-NEXT: #APP ; ZNVER1-NEXT: popw %ax # sched: [8:0.50] ; ZNVER1-NEXT: popw (%ecx) # sched: [5:0.50]