diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -2682,7 +2682,8 @@ GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone) { if (Zone.isTop()) { - if (Cand.SU->getDepth() > Zone.getScheduledLatency()) { + if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) > + Zone.getScheduledLatency()) { if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), TryCand, Cand, GenericSchedulerBase::TopDepthReduce)) return true; @@ -2691,7 +2692,8 @@ TryCand, Cand, GenericSchedulerBase::TopPathReduce)) return true; } else { - if (Cand.SU->getHeight() > Zone.getScheduledLatency()) { + if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) > + Zone.getScheduledLatency()) { if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) return true; diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll @@ -27,10 +27,10 @@ ; NONE16: fmov s1, wzr ; NONE16: fmov d2, xzr ; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0 -; ZEROFP: ldr h0,{{.*}} -; ZEROFP: movi v{{[0-3]+}}.2d, #0 -; ZEROFP: movi v{{[0-3]+}}.2d, #0 -; ZEROFP: movi v{{[0-3]+}}.2d, #0 +; ZEROFP-DAG: ldr h0,{{.*}} +; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0 +; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0 +; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0 ; ZERO16: movi v{{[0-3]+}}.2d, #0 ; ZERO16: movi v{{[0-3]+}}.2d, #0 ; ZERO16: movi v{{[0-3]+}}.2d, #0 diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -29,19 +29,19 @@ ; ; VI-LABEL: test_bswap_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s0, 0xff00ff +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_alignbit_b32 v0, s4, s4, 8 -; VI-NEXT: v_alignbit_b32 v1, s4, s4, 24 -; VI-NEXT: s_mov_b32 s4, 0xff00ff -; VI-NEXT: v_bfi_b32 v0, s4, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_alignbit_b32 v0, s2, s2, 8 +; VI-NEXT: v_alignbit_b32 v1, s2, s2, 24 +; VI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %in, align 4 %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -265,10 +265,10 @@ ; GCN-NOT: s_add_u32 s32, s32, 0x800 ; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}} -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12 -; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 -; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12 +; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 +; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 ; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 ; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 @@ -336,7 +336,7 @@ ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 -; GCN: s_waitcnt vmcnt(0) +; GCN: s_waitcnt vmcnt ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 ; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -678,17 +678,15 @@ ; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8 ; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12 -; GCN-NOT: s_add_u32 [[SP]], - -; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8 -; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12 +; HSA-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8 +; HSA-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12 ; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}} ; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4 -; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8 -; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12 +; MESA-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8 +; MESA-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12 ; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}} ; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -197,16 +197,16 @@ ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:4 -; GCN-NEXT: global_load_dword v2, v[2:3], off +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; GCN-NEXT: global_load_dword v3, v[0:1], off ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:4 -; GCN-NEXT: buffer_store_short_d16_hi v2, off, s[0:3], s9 offset:6 -; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:8 +; GCN-NEXT: buffer_store_short v3, off, s[0:3], s9 offset:4 +; GCN-NEXT: buffer_store_short_d16_hi v3, off, s[0:3], s9 offset:6 +; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:8 ; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], s9 offset:4 ; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], s9 offset:6 ; GCN-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -138,9 +138,9 @@ ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s8, s4 ; VI-NEXT: s_mov_b32 s9, s5 -; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -160,32 +160,32 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s15 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 -; SI-NEXT: s_mov_b32 s19, s15 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s16, s2 -; SI-NEXT: s_mov_b32 s17, s3 -; SI-NEXT: s_mov_b32 s6, s14 -; SI-NEXT: s_mov_b32 s7, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: @@ -204,10 +204,10 @@ ; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -323,8 +323,8 @@ ; SI-NEXT: s_mov_b64 s[12:13], s[10:11] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b32 s16, 0xff00 -; SI-NEXT: s_movk_i32 s17, 0xff +; SI-NEXT: s_mov_b32 s12, 0xff00 +; SI-NEXT: s_movk_i32 s13, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 @@ -337,12 +337,12 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s16, v1 +; SI-NEXT: v_and_b32_e32 v4, s12, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 -; SI-NEXT: v_and_b32_e32 v2, s16, v0 -; SI-NEXT: v_and_b32_e32 v3, s17, v3 +; SI-NEXT: v_and_b32_e32 v2, s12, v0 +; SI-NEXT: v_and_b32_e32 v3, s13, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v1, s17, v1 +; SI-NEXT: v_and_b32_e32 v1, s13, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -276,11 +276,10 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s12, 0xff -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_movk_i32 s8, 0xff +; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 @@ -291,14 +290,14 @@ ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v6 -; SI-NEXT: v_and_b32_e32 v7, s12, v7 +; SI-NEXT: v_and_b32_e32 v7, s8, v7 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v6, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_and_b32_e32 v1, s12, v4 +; SI-NEXT: v_and_b32_e32 v1, s8, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 ; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -416,15 +415,15 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v9, v[2:3] -; VI-NEXT: flat_load_ubyte v10, v[4:5] +; VI-NEXT: flat_load_ubyte v8, v[2:3] +; VI-NEXT: flat_load_ubyte v9, v[4:5] ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v8, v[0:1] +; VI-NEXT: flat_load_ubyte v10, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, 6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v2, v[2:3] @@ -432,16 +431,16 @@ ; VI-NEXT: flat_load_ubyte v4, v[6:7] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_or_b32_e32 v4, v3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v8 +; VI-NEXT: v_or_b32_e32 v0, v1, v10 ; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -609,10 +609,10 @@ ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]] -; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[B_V2_F16]], v[[A_V2_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]] +; VI: v_cmp_nlt_f16_e32 vcc, v[[B_F16_1]], v[[A_F16_1]] ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1326,9 +1326,9 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} -; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]] -; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] +; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} +; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]] +; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] ; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -2,11 +2,11 @@ ; Test for a conv2d like sequence of loads. -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) { diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -82,8 +82,8 @@ ; CHECK-LABEL: @global_array ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] ; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 -; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 -; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 +; CHECK-DAG: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 +; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 ; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -24,21 +24,21 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v3, v2, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, v3, s2 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, s6, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GFX9-NEXT: v_add_u32_e32 v4, s6, v4 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v4 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX9-NEXT: s_add_u32 s6, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: s_add_u32 s4, s4, 4 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] ; GFX9-NEXT: s_addc_u32 s5, s5, 0 @@ -88,9 +88,11 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v3, v2, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 ; GFX9-NEXT: v_mul_lo_u32 v6, v3, s2 ; GFX9-NEXT: v_sub_u32_e32 v5, 1, v3 @@ -104,9 +106,7 @@ ; GFX9-NEXT: v_add_u32_e32 v3, s6, v3 ; GFX9-NEXT: v_add_u32_e32 v5, s6, v5 ; GFX9-NEXT: s_add_u32 s6, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: s_add_u32 s4, s4, 4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: s_addc_u32 s5, s5, 0 @@ -162,15 +162,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, s3 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v5, s6, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 +; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 +; GFX9-NEXT: v_sub_u32_e32 v7, s6, v4 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: s_add_i32 s6, s6, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: s_add_u32 s4, s4, 4 ; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 @@ -222,10 +222,10 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 ; GFX9-NEXT: v_sub_u32_e32 v4, s3, v3 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 @@ -313,34 +313,34 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) { ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_movk_i32 s6, 0x400 +; GFX9-NEXT: s_movk_i32 s8, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s2, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s5, s4, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: BB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v8, v7, v1 -; GFX9-NEXT: v_trunc_f32_e32 v8, v8 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v8 -; GFX9-NEXT: v_mad_f32 v7, -v8, v0, v7 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v7, v7, s3 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4 +; GFX9-NEXT: v_mul_f32_e32 v9, v7, v1 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 +; GFX9-NEXT: v_mad_f32 v7, -v9, v0, v7 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v7|, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, s7 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[2:3], 0, v10, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v7, v7, s5 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB5_1 @@ -422,38 +422,38 @@ ; GFX9-LABEL: srem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_movk_i32 s3, 0x400 +; GFX9-NEXT: s_movk_i32 s5, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: BB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX9-NEXT: v_xor_b32_e32 v9, s4, v7 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v8, v10, v1 -; GFX9-NEXT: v_xor_b32_e32 v9, s2, v7 -; GFX9-NEXT: v_trunc_f32_e32 v8, v8 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v9 -; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v8 -; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v10 +; GFX9-NEXT: v_mul_f32_e32 v9, v10, v1 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9 +; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v10 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0| -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v9|, |v0| +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v2, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 +; GFX9-NEXT: v_mov_b32_e32 v8, s7 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2573,20 +2573,20 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_lshr_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s3, s2, s0 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_and_b32 s0, s1, s0 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -2597,20 +2597,20 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -356,28 +356,28 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_and_b32 s7, s4, s8 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_and_b32 s6, s5, s8 +; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 +; GFX7-NEXT: s_lshr_b32 s5, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -387,30 +387,30 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s3, s2, s0 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -419,30 +419,30 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -927,33 +927,33 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80000 -; GFX8-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s3 -; GFX8-NEXT: s_bfe_i32 s3, s4, 0x80000 -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x80000 -; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s0 -; GFX8-NEXT: s_bfe_i32 s0, s1, 0x80000 -; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s6 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80000 +; GFX8-NEXT: s_lshr_b32 s4, s2, 16 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80000 +; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s1 +; GFX8-NEXT: s_bfe_i32 s1, s3, 0x80000 +; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s3 +; GFX8-NEXT: s_and_b32 s3, s0, s6 +; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s2 +; GFX8-NEXT: s_bfe_i32 s2, s4, 0x80000 ; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s4 -; GFX8-NEXT: s_and_b32 s4, s2, s5 -; GFX8-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NEXT: s_and_b32 s3, s2, s3 -; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s4, s0, s5 +; GFX8-NEXT: v_mov_b32_e32 v7, s3 +; GFX8-NEXT: s_and_b32 s2, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v7, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v6, v5, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -183,28 +183,28 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_and_b32 s7, s4, s8 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_and_b32 s6, s5, s8 +; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 +; GFX7-NEXT: s_lshr_b32 s5, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -214,20 +214,20 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_and_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 @@ -246,20 +246,20 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 @@ -353,28 +353,28 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_and_b32 s7, s4, s8 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_and_b32 s6, s5, s8 +; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 +; GFX7-NEXT: s_lshr_b32 s5, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -384,30 +384,30 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s3, s2, s0 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -416,30 +416,30 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1421,28 +1421,28 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s7, s6 -; GFX7-NEXT: s_bfe_u32 s9, s6, 0x80008 -; GFX7-NEXT: s_sext_i32_i8 s5, s4 -; GFX7-NEXT: s_and_b32 s7, s7, s8 +; GFX7-NEXT: s_sext_i32_i8 s6, s4 ; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s7, s5 +; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: s_and_b32 s7, s7, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_u32 s11, s6, 0x80010 -; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 +; GFX7-NEXT: s_and_b32 s6, s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: s_bfe_u32 s12, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s5, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1802,29 +1802,29 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s7, 0xff +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80008 -; GFX7-NEXT: s_bfe_u32 s12, s6, 0x80010 -; GFX7-NEXT: s_lshr_b32 s9, s6, 24 -; GFX7-NEXT: s_and_b32 s6, s6, s7 -; GFX7-NEXT: s_lshr_b32 s5, s4, 24 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX7-NEXT: s_lshr_b32 s6, s4, 24 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008 +; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 +; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010 +; GFX7-NEXT: s_lshr_b32 s9, s5, 24 +; GFX7-NEXT: s_and_b32 s5, s5, s8 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s4, s4, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -654,53 +654,53 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX8-NEXT: s_lshr_b32 s4, s6, 12 -; GFX8-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX8-NEXT: s_bfe_i32 s11, s6, 0x40008 -; GFX8-NEXT: s_lshr_b32 s1, s0, 12 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX8-NEXT: s_lshr_b32 s4, s1, 12 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40000 +; GFX8-NEXT: s_lshr_b32 s5, s2, 12 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v7, s9 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX8-NEXT: s_bfe_i32 s13, s6, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX8-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v8, s13 -; GFX8-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX8-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v9, s15 -; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s6, s6, 28 +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX8-NEXT: s_ashr_i32 s2, s2, 28 ; GFX8-NEXT: v_mov_b32_e32 v10, s17 -; GFX8-NEXT: s_ashr_i32 s0, s0, 28 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -708,53 +708,53 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s6, 0x40008 -; GFX9-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40000 +; GFX9-NEXT: s_lshr_b32 s5, s2, 12 +; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-NEXT: s_bfe_i32 s13, s6, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s6, s6, 28 +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-NEXT: s_ashr_i32 s2, s2, 28 ; GFX9-NEXT: v_mov_b32_e32 v10, s17 -; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s6, v6, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -762,53 +762,53 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-DL-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s6, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 12 +; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-DL-NEXT: s_bfe_i32 s13, s6, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-DL-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s6, s6, 28 +; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 -; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v6, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1578,22 +1578,22 @@ ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[10:11], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 -; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40000 -; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40004 -; GFX7-NEXT: s_bfe_i32 s21, s0, 0x40008 -; GFX7-NEXT: s_ashr_i32 s15, s0, 28 -; GFX7-NEXT: s_bfe_i32 s0, s0, 0x4000c ; GFX7-NEXT: s_ashr_i32 s8, s1, 28 ; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40018 +; GFX7-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40014 +; GFX7-NEXT: s_bfe_i32 s18, s2, 0x40010 +; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40000 +; GFX7-NEXT: s_bfe_i32 s20, s2, 0x40004 +; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40008 +; GFX7-NEXT: s_ashr_i32 s15, s2, 28 +; GFX7-NEXT: s_bfe_i32 s2, s2, 0x4000c ; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40014 ; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40000 @@ -1603,15 +1603,15 @@ ; GFX7-NEXT: s_bfe_i32 s14, s1, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v2, s21 ; GFX7-NEXT: s_bfe_i32 s1, s1, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s1, v1 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 ; GFX7-NEXT: v_mul_i32_i24_e32 v4, s12, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 @@ -1637,51 +1637,51 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX8-NEXT: s_lshr_b32 s15, s2, 4 -; GFX8-NEXT: s_lshr_b32 s16, s2, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s16 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX8-NEXT: s_lshr_b32 s8, s0, 4 -; GFX8-NEXT: s_lshr_b32 s9, s0, 8 +; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s2 +; GFX8-NEXT: s_lshr_b32 s8, s2, 4 +; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s0 +; GFX8-NEXT: s_lshr_b32 s15, s0, 4 +; GFX8-NEXT: s_lshr_b32 s16, s0, 8 ; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s16 ; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s15 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: s_lshr_b32 s7, s0, 12 -; GFX8-NEXT: s_lshr_b32 s14, s2, 12 +; GFX8-NEXT: s_lshr_b32 s7, s2, 12 +; GFX8-NEXT: s_lshr_b32 s14, s0, 12 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s14 -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshr_b32 s13, s2, 16 +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_lshr_b32 s13, s0, 16 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s13 -; GFX8-NEXT: s_lshr_b32 s5, s0, 20 -; GFX8-NEXT: s_lshr_b32 s12, s2, 20 +; GFX8-NEXT: s_lshr_b32 s5, s2, 20 +; GFX8-NEXT: s_lshr_b32 s12, s0, 20 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s5 ; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s12 -; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: s_lshr_b32 s4, s2, 24 +; GFX8-NEXT: s_lshr_b32 s11, s0, 24 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s11 -; GFX8-NEXT: s_lshr_b32 s1, s0, 28 -; GFX8-NEXT: s_lshr_b32 s10, s2, 28 +; GFX8-NEXT: s_lshr_b32 s1, s2, 28 +; GFX8-NEXT: s_lshr_b32 s10, s0, 28 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 ; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s1 @@ -2111,24 +2111,24 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s8, s0, 4 -; GFX9-NEXT: s_lshr_b32 s15, s1, 4 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX9-NEXT: s_lshr_b32 s8, s1, 4 +; GFX9-NEXT: s_lshr_b32 s15, s2, 4 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 ; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8 ; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX9-NEXT: s_lshr_b32 s9, s0, 12 -; GFX9-NEXT: s_lshr_b32 s10, s0, 8 -; GFX9-NEXT: s_lshr_b32 s16, s1, 12 -; GFX9-NEXT: s_lshr_b32 s17, s1, 8 +; GFX9-NEXT: s_lshr_b32 s9, s1, 12 +; GFX9-NEXT: s_lshr_b32 s10, s1, 8 +; GFX9-NEXT: s_lshr_b32 s16, s2, 12 +; GFX9-NEXT: s_lshr_b32 s17, s2, 8 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s10 ; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s17 @@ -2144,21 +2144,21 @@ ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s4, s0, 20 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_lshr_b32 s11, s1, 20 -; GFX9-NEXT: s_lshr_b32 s12, s1, 16 +; GFX9-NEXT: s_lshr_b32 s4, s1, 20 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshr_b32 s11, s2, 20 +; GFX9-NEXT: s_lshr_b32 s12, s2, 16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 ; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5 ; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s4 ; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12 ; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s11 -; GFX9-NEXT: s_lshr_b32 s6, s0, 28 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-NEXT: s_lshr_b32 s14, s1, 24 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: s_lshr_b32 s6, s1, 28 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: s_lshr_b32 s13, s2, 28 +; GFX9-NEXT: s_lshr_b32 s14, s2, 24 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7 ; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6 @@ -2180,7 +2180,7 @@ ; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX9-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 @@ -2199,24 +2199,24 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 4 -; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 4 +; GFX9-DL-NEXT: s_lshr_b32 s15, s2, 4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s10, s0, 8 -; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 12 -; GFX9-DL-NEXT: s_lshr_b32 s17, s1, 8 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 8 +; GFX9-DL-NEXT: s_lshr_b32 s16, s2, 12 +; GFX9-DL-NEXT: s_lshr_b32 s17, s2, 8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s17 @@ -2232,21 +2232,21 @@ ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 20 -; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 20 -; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 20 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 20 +; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s11 -; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28 -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 24 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 28 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28 +; GFX9-DL-NEXT: s_lshr_b32 s14, s2, 24 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 @@ -2268,7 +2268,7 @@ ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2418,38 +2418,38 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s1, 28 -; GFX9-NEXT: s_and_b32 s15, s1, 15 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s14, s2, 28 +; GFX9-NEXT: s_and_b32 s15, s2, 15 +; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s17, s2, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s1, 0x40018 ; GFX9-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-NEXT: s_lshr_b32 s7, s0, 28 +; GFX9-NEXT: s_lshr_b32 s7, s1, 28 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: s_and_b32 s8, s0, 15 +; GFX9-NEXT: s_and_b32 s8, s1, 15 ; GFX9-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v10, s1 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v10, s2 ; GFX9-NEXT: v_mul_lo_u16_e32 v3, s4, v3 ; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v5, s6, v5 @@ -2460,12 +2460,12 @@ ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v5, v7, v8 ; GFX9-NEXT: v_mul_lo_u16_e32 v9, s10, v9 -; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v6, v5, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 @@ -2484,38 +2484,38 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 28 -; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s17, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s14, s2, 28 +; GFX9-DL-NEXT: s_and_b32 s15, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s17, s2, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 28 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s8, s1, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s2 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s6, v5 @@ -2526,12 +2526,12 @@ ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s10, v9 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1108,13 +1108,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshl_b32 s0, s4, 4 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -1126,13 +1126,13 @@ ; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 -; VI-NEXT: s_load_dword s1, s[2:3], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s0, s0, 4 +; VI-NEXT: s_lshl_b32 s0, s4, 4 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1144,13 +1144,13 @@ ; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 -; CI-NEXT: s_load_dword s1, s[2:3], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s0, s0, 4 +; CI-NEXT: s_lshl_b32 s0, s4, 4 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: v_bfi_b32 v2, s0, v2, v3 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -4,6 +4,7 @@ ;CHECK-LABEL: {{^}}test1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc +;SICI: v_mov_b32_e32 v3, 0x2000 ;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc @@ -14,7 +15,7 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) -;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc +;SICI: buffer_atomic_swap v0, v3, s[0:3], 0 offen glc ;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll @@ -4,9 +4,9 @@ ;CHECK-LABEL: {{^}}test1: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc ;CHECK: s_waitcnt vmcnt(0) @@ -79,9 +79,9 @@ ;CHECK-NOT: s_waitcnt ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) ;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v4, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1559,22 +1559,22 @@ ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -173,8 +173,8 @@ ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], s[[A_F16]], v[[C_F16_1]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], s[[A_F16]], v[[C_V2_F16]] ; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]] @@ -198,8 +198,9 @@ ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}} ; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}} @@ -243,8 +244,9 @@ ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}} ; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -40,51 +40,51 @@ ; VI-LABEL: maxnum_f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, @@ -292,17 +292,17 @@ ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -362,18 +362,18 @@ ; ; GFX9-LABEL: maxnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x44004200 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x44004200 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { @@ -429,18 +429,18 @@ ; ; GFX9-LABEL: maxnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x42004400 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x42004400 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { @@ -735,12 +735,12 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v2, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 ; GFX9-NEXT: v_pk_max_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -40,51 +40,51 @@ ; VI-LABEL: minnum_f16_ieee: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_f16_ieee: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, @@ -315,17 +315,17 @@ ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -415,18 +415,18 @@ ; ; GFX9-LABEL: minnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x44004200 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x44004200 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { @@ -482,18 +482,18 @@ ; ; GFX9-LABEL: minnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x42004400 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x42004400 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { @@ -788,12 +788,12 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v2, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 ; GFX9-NEXT: v_pk_min_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -115,14 +115,14 @@ ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b32 s4, 0xffff ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_and_b32_e32 v2, s4, v2 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -193,14 +193,14 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s9, s8, 16 -; CI-NEXT: s_mov_b32 s10, 0xffff -; CI-NEXT: s_and_b32 s8, s8, s10 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_lshr_b32 s1, s8, 16 +; CI-NEXT: s_and_b32 s8, s8, s0 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s10, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -268,14 +268,14 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s9, s8, 16 -; CI-NEXT: s_mov_b32 s10, 0xffff -; CI-NEXT: s_and_b32 s8, s8, s10 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_lshr_b32 s1, s8, 16 +; CI-NEXT: s_and_b32 s8, s8, s0 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s10, v2 -; CI-NEXT: v_lshr_b32_e32 v3, s9, v3 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s1, v3 ; CI-NEXT: v_lshr_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -478,7 +478,7 @@ ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b32 s4, 0xffff ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 @@ -486,10 +486,10 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 -; CI-NEXT: v_and_b32_e32 v4, s8, v4 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 -; CI-NEXT: v_and_b32_e32 v5, s8, v5 +; CI-NEXT: v_and_b32_e32 v2, s4, v2 +; CI-NEXT: v_and_b32_e32 v4, s4, v4 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 +; CI-NEXT: v_and_b32_e32 v5, s4, v5 ; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v3 ; CI-NEXT: v_lshrrev_b32_e32 v5, v9, v7 ; CI-NEXT: v_lshrrev_b32_e32 v2, v4, v2 @@ -563,13 +563,13 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b32 s8, 0xff00ff +; CI-NEXT: s_mov_b32 s4, 0xff00ff ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 +; CI-NEXT: v_and_b32_e32 v2, s4, v2 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -134,25 +134,25 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: flat_load_ushort v8, v[4:5] -; VI-NEXT: flat_load_dword v9, v[0:1] +; VI-NEXT: flat_load_ushort v4, v[4:5] +; VI-NEXT: flat_load_dword v5, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: flat_load_ushort v7, v[0:1] +; VI-NEXT: flat_load_dword v8, v[2:3] +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v0, v8, v0 +; VI-NEXT: v_max_i16_e32 v4, v4, v7 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v9, v1 -; VI-NEXT: v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: flat_store_short v[4:5], v0 -; VI-NEXT: flat_store_dword v[6:7], v1 +; VI-NEXT: v_max_i16_e32 v6, v5, v8 +; VI-NEXT: v_max_i16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: flat_store_short v[2:3], v4 +; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: @@ -167,22 +167,22 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v6, v[0:1], off -; GFX9-NEXT: global_load_dword v7, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: global_load_dword v6, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-NEXT: global_load_short_d16 v8, v[0:1], off offset:4 -; GFX9-NEXT: global_load_short_d16 v9, v[2:3], off offset:4 -; GFX9-NEXT: v_pk_max_i16 v6, v6, v7 +; GFX9-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-NEXT: global_load_short_d16 v7, v[0:1], off offset:4 +; GFX9-NEXT: global_load_short_d16 v8, v[2:3], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_pk_max_i16 v2, v5, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v0, v8, v9 -; GFX9-NEXT: global_store_dword v[4:5], v6, off -; GFX9-NEXT: global_store_short v[4:5], v0, off offset:4 +; GFX9-NEXT: v_pk_max_i16 v3, v7, v8 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4 ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -57,7 +57,7 @@ ; GFX9-NEXT: v_and_b32_e32 v5, 1, v18 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX9-NEXT: s_cbranch_execz BB1_4 ; GFX9-NEXT: ; %bb.1: ; %bb19 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6 @@ -67,7 +67,7 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v7, v17, v12 -; GFX9-NEXT: s_mov_b64 s[12:13], 0 +; GFX9-NEXT: s_mov_b64 s[10:11], 0 ; GFX9-NEXT: BB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 @@ -76,34 +76,34 @@ ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_madak_f32 v8, v8, v4, 0x3727c5ac ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v18, v8, v5 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v16 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, v13 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v8, v13 ; GFX9-NEXT: v_mul_lo_u32 v8, v8, v15 ; GFX9-NEXT: v_sub_u32_e32 v19, v9, v18 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[8:9], v19, v14 -; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v19, v14 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 -; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc ; GFX9-NEXT: v_add_u32_e32 v8, v12, v8 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 2, v[8:9] -; GFX9-NEXT: s_or_b64 s[12:13], s[4:5], s[12:13] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], v10, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v11, v9, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[6:7], v10, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v11, v9, s[6:7] ; GFX9-NEXT: global_load_dword v8, v[8:9], off +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 +; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; GFX9-NEXT: ds_write_b32 v3, v8 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_cbranch_execnz BB1_2 ; GFX9-NEXT: ; %bb.3: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[12:13] -; GFX9-NEXT: BB1_4: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX9-NEXT: BB1_4: ; %Flow3 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -89,26 +89,26 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -190,22 +190,22 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: flat_load_dword v6, v[6:7] -; VI-NEXT: flat_load_dword v4, v[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_load_dword v4, v[2:3] +; VI-NEXT: flat_load_dword v5, v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v6 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 -; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6 +; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v4 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5 +; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: flat_store_dword v[2:3], v5 +; VI-NEXT: flat_store_dword v[2:3], v6 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm @@ -214,22 +214,22 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: global_load_dword v6, v[6:7], off -; GFX9-NEXT: global_load_dword v4, v[4:5], off +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v5, v6, v4 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6 +; GFX9-NEXT: v_add_u32_e32 v6, v4, v5 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: global_store_dword v[2:3], v5, off +; GFX9-NEXT: global_store_dword v[2:3], v6, off ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -354,50 +354,50 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4 -; VI-NEXT: v_addc_u32_e32 v9, vcc, v7, v5, vcc -; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] -; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7] -; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9] +; VI-NEXT: v_add_u32_e32 v8, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v9, vcc, v3, v1, vcc +; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[0:1] +; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[2:3] +; VI-NEXT: flat_store_dwordx2 v[6:7], v[8:9] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: flat_store_byte v[0:1], v2 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_saddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: v_mov_b32_e32 v7, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v5, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v3, v1, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[2:3] +; GFX9-NEXT: global_store_dwordx2 v[6:7], v[8:9], off ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: global_store_byte v[4:5], v0, off ; GFX9-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 @@ -447,58 +447,58 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v9, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4 -; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v2, v0 +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v1 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v3 ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9] -; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 +; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_dwordx2 v[6:7], v[8:9] +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_saddo_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: v_mov_b32_e32 v7, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v9, v7, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v6, v4 -; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6 +; GFX9-NEXT: v_add_u32_e32 v9, v3, v1 +; GFX9-NEXT: v_add_u32_e32 v8, v2, v0 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v1 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v3 ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: global_store_dwordx2 v[6:7], v[8:9], off +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -174,9 +174,9 @@ ; GCN-NOHSA-NOT: v_add ; CI-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} ; CI-NOHSA-NOT: v_add -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 -; CI-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} -; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 +; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} +; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} @@ -204,10 +204,10 @@ ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: ; SI: s_mov_b32 {{s[0-9]+}}, 0x13480 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -37,10 +37,10 @@ ; CHECK: INLINEASM &"", 1, 851978, def dead [[COPY1]], 851978, def dead [[COPY]].sub1, 2147483657, [[COPY1]], 2147549193, [[COPY]].sub1 ; CHECK: %11.sub0:vreg_512 = COPY [[COPY]].sub0 ; CHECK: %11.sub3:vreg_512 = COPY [[COPY]].sub3 - ; CHECK: dead %10:vgpr_32 = V_ADD_I32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec ; CHECK: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]] ; CHECK: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]] ; CHECK: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11 + ; CHECK: dead %10:vgpr_32 = V_ADD_I32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec ; CHECK: S_BRANCH %bb.1 bb.0: liveins: $sgpr6_sgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -240,14 +240,14 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1 @@ -493,15 +493,15 @@ ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s11, 0xf000 ; TONGA-NEXT: s_mov_b32 s10, -1 -; TONGA-NEXT: s_mov_b32 s4, 0x4f800000 +; TONGA-NEXT: s_mov_b32 s6, s10 +; TONGA-NEXT: s_mov_b32 s7, s11 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s4, s2 +; TONGA-NEXT: s_mov_b32 s5, s3 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: s_mov_b32 s2, 0x4f800000 ; TONGA-NEXT: s_mov_b32 s8, s0 ; TONGA-NEXT: s_mov_b32 s9, s1 -; TONGA-NEXT: s_mov_b32 s0, s2 -; TONGA-NEXT: s_mov_b32 s1, s3 -; TONGA-NEXT: s_mov_b32 s2, s10 -; TONGA-NEXT: s_mov_b32 s3, s11 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 @@ -519,10 +519,10 @@ ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; TONGA-NEXT: v_mul_f32_e32 v4, s4, v5 +; TONGA-NEXT: v_mul_f32_e32 v4, s2, v5 ; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v4 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1 -; TONGA-NEXT: v_mul_f32_e32 v5, s4, v7 +; TONGA-NEXT: v_mul_f32_e32 v5, s2, v7 ; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6 ; TONGA-NEXT: v_mul_hi_u32 v6, v4, v2 @@ -575,15 +575,15 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s4, 0x4f800000 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s2, 0x4f800000 ; GFX9-NEXT: s_mov_b32 s8, s0 ; GFX9-NEXT: s_mov_b32 s9, s1 -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v3 @@ -598,9 +598,9 @@ ; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GFX9-NEXT: v_xor_b32_e32 v5, v4, v5 -; GFX9-NEXT: v_mul_f32_e32 v7, s4, v7 +; GFX9-NEXT: v_mul_f32_e32 v7, s2, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_f32_e32 v8, s4, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v7, v2 @@ -765,14 +765,14 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -790,14 +790,14 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -1471,14 +1471,14 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1504,14 +1504,14 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1614,30 +1614,30 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 +; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:1 -; TONGA-NEXT: buffer_load_sbyte v2, off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v0 +; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v0 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v2 -; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0 +; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v1 +; TONGA-NEXT: v_xor_b32_e32 v0, v1, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 -; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4 -; TONGA-NEXT: v_trunc_f32_e32 v2, v2 -; TONGA-NEXT: v_mad_f32 v3, -v2, v1, v3 -; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2 -; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| +; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 +; TONGA-NEXT: v_trunc_f32_e32 v1, v1 +; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3 +; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 +; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm @@ -1647,15 +1647,15 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 +; GFX9-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:1 -; GFX9-NEXT: buffer_load_sbyte v2, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1769,39 +1769,39 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:4 +; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 +; TONGA-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:2 -; TONGA-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4 -; TONGA-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:6 -; TONGA-NEXT: buffer_load_ushort v3, off, s[4:7], 0 -; TONGA-NEXT: s_waitcnt vmcnt(3) -; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; TONGA-NEXT: s_waitcnt vmcnt(2) +; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; TONGA-NEXT: v_or_b32_e32 v0, v0, v1 +; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 +; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v0 ; TONGA-NEXT: s_waitcnt vmcnt(1) ; TONGA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; TONGA-NEXT: v_or_b32_e32 v1, v1, v2 -; TONGA-NEXT: v_bfe_i32 v1, v1, 0, 23 -; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_or_b32_e32 v0, v3, v0 -; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 -; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 +; TONGA-NEXT: v_or_b32_e32 v2, v3, v2 +; TONGA-NEXT: v_bfe_i32 v2, v2, 0, 23 +; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v2 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 -; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 -; TONGA-NEXT: v_trunc_f32_e32 v1, v1 -; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3 -; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 -; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| +; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4 +; TONGA-NEXT: v_trunc_f32_e32 v2, v2 +; TONGA-NEXT: v_mad_f32 v3, -v2, v1, v3 +; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2 +; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm @@ -1811,37 +1811,37 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:4 +; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 +; GFX9-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:2 -; GFX9-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4 -; GFX9-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:6 -; GFX9-NEXT: buffer_load_ushort v3, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 23 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 23 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1 -; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v4 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GFX9-NEXT: v_mad_f32 v2, -v2, v1, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 @@ -1954,28 +1954,29 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 +; TONGA-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 +; TONGA-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 +; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:2 -; TONGA-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4 -; TONGA-NEXT: buffer_load_sbyte v2, off, s[4:7], 0 offset:6 -; TONGA-NEXT: buffer_load_ushort v3, off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(3) ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; TONGA-NEXT: s_waitcnt vmcnt(2) +; TONGA-NEXT: v_or_b32_e32 v1, v1, v0 +; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v1 ; TONGA-NEXT: s_waitcnt vmcnt(1) ; TONGA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; TONGA-NEXT: v_or_b32_e32 v1, v1, v2 -; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_or_b32_e32 v3, v3, v0 +; TONGA-NEXT: v_or_b32_e32 v3, v3, v2 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v3 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 ; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4 @@ -2383,16 +2384,14 @@ ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; TONGA-NEXT: s_mov_b32 s0, 0x1389c755 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 +; TONGA-NEXT: s_mov_b32 s4, 0x1389c755 +; TONGA-NEXT: s_mov_b32 s0, s6 +; TONGA-NEXT: s_mov_b32 s1, s7 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0 -; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0 -; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0 -; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0 +; TONGA-NEXT: v_mul_hi_i32 v0, v0, s4 +; TONGA-NEXT: v_mul_hi_i32 v1, v1, s4 +; TONGA-NEXT: v_mul_hi_i32 v2, v2, s4 +; TONGA-NEXT: v_mul_hi_i32 v3, v3, s4 ; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2405,7 +2404,7 @@ ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: scalarize_mulhs_4xi32: @@ -2417,16 +2416,14 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s0, 0x1389c755 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_mov_b32 s4, 0x1389c755 +; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0 -; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0 -; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0 -; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0 +; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4 +; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4 +; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4 +; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2439,7 +2436,7 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: scalarize_mulhs_4xi32: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -73,7 +73,7 @@ ; GCN-LABEL: {{^}}mul_v2i16: ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]] +; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]] ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] ; NOSDWA-NOT: v_mul_u32_u24_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -45,33 +45,33 @@ ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_mov_b32 s22, s14 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_mov_b32 s16, s10 ; VI-NEXT: s_mov_b32 s17, s11 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s10, s14 +; VI-NEXT: s_mov_b32 s11, s15 +; VI-NEXT: s_mov_b32 s20, s6 +; VI-NEXT: s_mov_b32 s21, s7 +; VI-NEXT: s_mov_b32 s23, s15 +; VI-NEXT: s_mov_b32 s2, s14 +; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_mov_b32 s19, s15 +; VI-NEXT: buffer_load_ushort v0, off, s[20:23], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s5 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[12:15], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, @@ -128,22 +128,22 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s16, s2 +; VI-NEXT: s_mov_b32 s17, s3 ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -203,22 +203,22 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s16, s2 +; VI-NEXT: s_mov_b32 s17, s3 ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -278,27 +278,27 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s16, s2 +; VI-NEXT: s_mov_b32 s17, s3 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 -; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 +; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -354,27 +354,27 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s16, s2 +; VI-NEXT: s_mov_b32 s17, s3 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 -; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 +; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -447,28 +447,28 @@ ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_mov_b32 s22, s14 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_mov_b32 s16, s10 ; VI-NEXT: s_mov_b32 s17, s11 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s10, s14 +; VI-NEXT: s_mov_b32 s11, s15 +; VI-NEXT: s_mov_b32 s20, s6 +; VI-NEXT: s_mov_b32 s21, s7 +; VI-NEXT: s_mov_b32 s23, s15 +; VI-NEXT: s_mov_b32 s2, s14 +; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: buffer_load_dword v0, off, s[20:23], 0 +; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_mov_b32 s19, s15 ; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v3, off, s[16:19], 0 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s5 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -482,7 +482,7 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -551,23 +551,23 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s16, s2 +; VI-NEXT: s_mov_b32 s17, s3 ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 -; VI-NEXT: s_movk_i32 s0, 0x3900 +; VI-NEXT: s_movk_i32 s2, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 @@ -575,7 +575,7 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, s0, v3 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -646,23 +646,23 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s16, s2 +; VI-NEXT: s_mov_b32 s17, s3 ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 -; VI-NEXT: s_movk_i32 s0, 0x3900 +; VI-NEXT: s_movk_i32 s2, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 @@ -670,7 +670,7 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_cmp_gt_f16_e32 vcc, s0, v3 +; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -705,31 +705,31 @@ ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s16, s2 ; SI-NEXT: s_mov_b32 s17, s3 -; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5 -; SI-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -741,34 +741,34 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 -; VI-NEXT: v_mov_b32_e32 v3, 0x3900 +; VI-NEXT: s_mov_b32 s16, s2 +; VI-NEXT: s_mov_b32 s17, s3 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: v_mov_b32_e32 v4, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v4 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 @@ -802,33 +802,33 @@ ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s16, s2 ; SI-NEXT: s_mov_b32 s17, s3 -; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5 -; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -838,34 +838,34 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 -; VI-NEXT: v_mov_b32_e32 v3, 0x3900 +; VI-NEXT: s_mov_b32 s16, s2 +; VI-NEXT: s_mov_b32 s17, s3 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: v_mov_b32_e32 v4, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v4 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -54,8 +54,8 @@ ; after 64-bit shift is split. ; GCN-LABEL: {{^}}lshr_and_i64_35: -; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dword v[[LO:[0-9]+]] +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]] ; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -13,14 +13,14 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3 ; GCN-NEXT: v_lshl_b32_e32 v0, v0, v2 @@ -59,15 +59,15 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshl_b32_e32 v3, v3, v7 ; GCN-NEXT: v_lshl_b32_e32 v2, v2, v6 @@ -409,25 +409,25 @@ ; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b64 s[12:13], s[2:3] ; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s14, 0 ; GCN-NEXT: s_mov_b32 s15, s7 -; GCN-NEXT: s_mov_b64 s[12:13], s[2:3] +; GCN-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4 -; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s2, 0xffff ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s0, 0xffff ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, v3, v2 -; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GCN-NEXT: v_and_b32_e32 v0, s2, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, v0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, v3, v1 +; GCN-NEXT: v_and_b32_e32 v0, s2, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -490,14 +490,14 @@ ; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: s_mov_b32 s8, 0xffff +; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, s8, v4 +; GCN-NEXT: v_and_b32_e32 v8, s4, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v9, s8, v5 +; GCN-NEXT: v_and_b32_e32 v9, s4, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v5, v5, v7 @@ -505,9 +505,9 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v4, v4, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v2, v8, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_and_b32_e32 v3, s8, v3 +; GCN-NEXT: v_and_b32_e32 v3, s4, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v2, s8, v2 +; GCN-NEXT: v_and_b32_e32 v2, s4, v2 ; GCN-NEXT: v_or_b32_e32 v3, v3, v5 ; GCN-NEXT: v_or_b32_e32 v2, v2, v4 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 @@ -732,18 +732,18 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[4:7], 0 offset:48 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v13 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -116,17 +116,17 @@ ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b32 s4, 0xffff ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v5, s8, v3 +; CI-NEXT: v_and_b32_e32 v5, s4, v3 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; CI-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v2, s4, v2 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -194,15 +194,15 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s9, 0xffff -; CI-NEXT: s_lshr_b32 s10, s8, 16 -; CI-NEXT: s_and_b32 s8, s8, s9 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_lshr_b32 s1, s8, 16 +; CI-NEXT: s_and_b32 s8, s8, s0 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3 -; CI-NEXT: v_and_b32_e32 v2, s9, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -481,14 +481,14 @@ ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b32 s4, 0xffff ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v8, s8, v4 +; CI-NEXT: v_and_b32_e32 v8, s4, v4 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_and_b32_e32 v9, s8, v5 +; CI-NEXT: v_and_b32_e32 v9, s4, v5 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_lshlrev_b32_e32 v5, v5, v7 @@ -496,9 +496,9 @@ ; CI-NEXT: v_lshlrev_b32_e32 v4, v4, v6 ; CI-NEXT: v_lshlrev_b32_e32 v2, v8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v2, s4, v2 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 @@ -569,14 +569,14 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b32 s8, 0xff00 +; CI-NEXT: s_mov_b32 s4, 0xff00 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; CI-NEXT: v_and_b32_e32 v4, s8, v4 +; CI-NEXT: v_and_b32_e32 v4, s4, v4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_and_b32_e32 v3, s4, v3 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -112,17 +112,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_subrev_u32_e32 v1, vcc, 64, v4 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0 -; VI-NEXT: flat_store_dword v[2:3], v1 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: @@ -133,17 +133,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v4 +; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dword v[2:3], v1, off -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_subrev_u32_e32 v3, 64, v4 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: @@ -944,17 +944,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v3, v[0:1] ; VI-NEXT: flat_load_ushort v4, v[0:1] -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_subrev_u16_e32 v1, 64, v4 +; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 -; VI-NEXT: flat_store_short v[2:3], v1 -; VI-NEXT: flat_store_short v[2:3], v0 +; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: flat_store_short v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: @@ -965,17 +965,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v3, v[0:1], off ; GFX9-NEXT: global_load_ushort v4, v[0:1], off -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v4 +; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0 -; GFX9-NEXT: global_store_short v[2:3], v1, off -; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: v_subrev_u16_e32 v3, 64, v4 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: global_store_short v[0:1], v3, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -265,12 +265,12 @@ ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 ; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 ; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 ; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -399,14 +399,14 @@ ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0 ; SI-NEXT: v_bfe_i32 v2, v0, 16, 8 @@ -423,14 +423,14 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0 @@ -523,14 +523,14 @@ ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48 ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 @@ -547,14 +547,14 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 ; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -88,14 +88,14 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[8:9], 0x0 +; VI-NEXT: s_load_dword s6, s[8:9], 0x0 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s6, s4, 16 -; VI-NEXT: s_lshr_b32 s7, s5, 16 -; VI-NEXT: s_sub_i32 s4, s4, s5 -; VI-NEXT: s_sub_i32 s5, s6, s7 +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: s_lshr_b32 s7, s6, 16 +; VI-NEXT: s_sub_i32 s4, s4, s6 +; VI-NEXT: s_sub_i32 s5, s5, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -106,13 +106,13 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[4:5], 0x0 +; VI-NEXT: s_load_dword s3, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 -; VI-NEXT: s_load_dword s1, s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: s_sext_i32_i16 s0, s2 +; VI-NEXT: s_sext_i32_i16 s1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mul_i32_i24_e32 v2, s1, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -105,32 +105,32 @@ ; VI-LABEL: madak_f16_use_2: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_mov_b32 s2, s14 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s10 ; VI-NEXT: s_mov_b32 s17, s11 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_mov_b32 s19, s15 +; VI-NEXT: s_mov_b32 s10, s14 +; VI-NEXT: s_mov_b32 s11, s15 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x4900 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x4900 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s6, s14 +; VI-NEXT: s_mov_b32 s7, s15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mac_f16_e32 v2, v0, v3 +; VI-NEXT: v_mac_f16_e32 v3, v0, v2 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 -; VI-NEXT: buffer_store_short v2, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v3, off, s[12:15], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r0, half addrspace(1)* %r1, diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -6,17 +6,17 @@ ; SI-LABEL: widen_i16_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_addk_i32 s2, 0x3e7 +; SI-NEXT: s_or_b32 s4, s2, 4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_constant_load: @@ -43,18 +43,18 @@ ; SI-LABEL: widen_i16_constant_load_zext_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s0, s0, 0xffff -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s2, s2, 0xffff +; SI-NEXT: s_addk_i32 s2, 0x3e7 +; SI-NEXT: s_or_b32 s4, s2, 4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_constant_load_zext_i32: @@ -83,18 +83,18 @@ ; SI-LABEL: widen_i16_constant_load_sext_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s0, s0 -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_sext_i32_i16 s2, s2 +; SI-NEXT: s_addk_i32 s2, 0x3e7 +; SI-NEXT: s_or_b32 s4, s2, 4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_constant_load_sext_i32: @@ -204,23 +204,23 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) { ; SI-LABEL: widen_v2i8_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s1, s0, 0xff00 -; SI-NEXT: s_add_i32 s0, s0, 12 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: s_and_b32 s0, s0, 0xff -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: s_addk_i32 s0, 0x2c00 -; SI-NEXT: s_or_b32 s0, s0, 0x300 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s4, s2, 0xff00 +; SI-NEXT: s_add_i32 s2, s2, 12 +; SI-NEXT: s_or_b32 s2, s2, 4 +; SI-NEXT: s_and_b32 s2, s2, 0xff +; SI-NEXT: s_or_b32 s2, s4, s2 +; SI-NEXT: s_addk_i32 s2, 0x2c00 +; SI-NEXT: s_or_b32 s4, s2, 0x300 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_v2i8_constant_load: @@ -299,16 +299,16 @@ ; SI-LABEL: widen_i1_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s0, s0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s4, s2, 1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i1_constant_load: @@ -333,18 +333,18 @@ ; SI-LABEL: widen_i16_zextload_i64_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s0, s0, 0xffff -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s2, s2, 0xffff +; SI-NEXT: s_addk_i32 s2, 0x3e7 +; SI-NEXT: s_or_b32 s4, s2, 4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_zextload_i64_constant_load: @@ -373,19 +373,19 @@ ; SI-LABEL: widen_i1_zext_to_i64_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s0, s0, 1 -; SI-NEXT: s_add_u32 s0, s0, 0x3e7 -; SI-NEXT: s_addc_u32 s1, 0, 0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_and_b32 s2, s2, 1 +; SI-NEXT: s_add_u32 s4, s2, 0x3e7 +; SI-NEXT: s_addc_u32 s5, 0, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i1_zext_to_i64_constant_load: @@ -415,17 +415,17 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_addk_i32 s2, 0x3e7 +; SI-NEXT: s_or_b32 s4, s2, 4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_constant32_load: @@ -453,17 +453,17 @@ ; SI-LABEL: widen_i16_global_invariant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_addk_i32 s2, 0x3e7 +; SI-NEXT: s_or_b32 s4, s2, 1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_global_invariant_load: diff --git a/llvm/test/CodeGen/PowerPC/PR33671.ll b/llvm/test/CodeGen/PowerPC/PR33671.ll --- a/llvm/test/CodeGen/PowerPC/PR33671.ll +++ b/llvm/test/CodeGen/PowerPC/PR33671.ll @@ -26,7 +26,7 @@ ret void ; CHECK-LABEL: test2 ; CHECK: addi 3, 3, 8 -; CHECK: lxvx [[LD:[0-9]+]], 0, 3 ; CHECK: addi [[REG:[0-9]+]], 4, 4 +; CHECK: lxvx [[LD:[0-9]+]], 0, 3 ; CHECK: stxvx [[LD]], 0, [[REG]] } diff --git a/llvm/test/CodeGen/PowerPC/botheightreduce.mir b/llvm/test/CodeGen/PowerPC/botheightreduce.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/botheightreduce.mir @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=machine-scheduler -o - %s | FileCheck %s +--- +# Check that machine-scheduler's BotHeightReduce heuristic puts the LD 8 in +# between the final run of MULLDs and the LDXs that feed them, to try to hide +# the latency of the LDXs. +name: test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $x3, $x4 + ; CHECK: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x4 + ; CHECK: [[COPY1:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x3 + ; CHECK: [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY1]], 1 + ; CHECK: [[CMPLDI:%[0-9]+]]:crrc = CMPLDI [[COPY]], 1 + ; CHECK: [[LI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = LI8 1 + ; CHECK: [[ISEL8_:%[0-9]+]]:g8rc = ISEL8 [[COPY]], [[LI8_]], [[CMPLDI]].sub_gt + ; CHECK: MTCTR8loop [[ISEL8_]], implicit-def dead $ctr8 + ; CHECK: [[LI8_1:%[0-9]+]]:g8rc = LI8 0 + ; CHECK: [[LI8_2:%[0-9]+]]:g8rc = LI8 2 + ; CHECK: [[LI8_3:%[0-9]+]]:g8rc = LI8 3 + ; CHECK: [[LI8_4:%[0-9]+]]:g8rc = LI8 5 + ; CHECK: [[LI8_5:%[0-9]+]]:g8rc = LI8 6 + ; CHECK: [[LI8_6:%[0-9]+]]:g8rc = LI8 7 + ; CHECK: bb.1: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1 + ; CHECK: [[LD:%[0-9]+]]:g8rc = LD 0, [[ADDI8_]] :: (load 8) + ; CHECK: [[LDX:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_]] :: (load 8) + ; CHECK: [[LDX1:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_3]] :: (load 8) + ; CHECK: [[LD1:%[0-9]+]]:g8rc = LD 4, [[ADDI8_]] :: (load 8) + ; CHECK: [[LDX2:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_4]] :: (load 8) + ; CHECK: [[LDX3:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_5]] :: (load 8) + ; CHECK: [[LDX4:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_6]] :: (load 8) + ; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8) + ; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]] + ; CHECK: [[LD2:%[0-9]+]]:g8rc = LD 8, [[ADDI8_]] :: (load 8) + ; CHECK: [[MULLD1:%[0-9]+]]:g8rc = MULLD [[MULLD]], [[LDX5]] + ; CHECK: [[MULLD2:%[0-9]+]]:g8rc = MULLD [[MULLD1]], [[LDX1]] + ; CHECK: [[MULLD3:%[0-9]+]]:g8rc = MULLD [[MULLD2]], [[LD1]] + ; CHECK: [[MULLD4:%[0-9]+]]:g8rc = MULLD [[MULLD3]], [[LDX2]] + ; CHECK: [[MULLD5:%[0-9]+]]:g8rc = MULLD [[MULLD4]], [[LDX3]] + ; CHECK: [[MULLD6:%[0-9]+]]:g8rc = MULLD [[MULLD5]], [[LDX4]] + ; CHECK: [[MADDLD8_:%[0-9]+]]:g8rc = MADDLD8 [[MULLD6]], [[LD2]], [[MADDLD8_]] + ; CHECK: [[COPY2:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY [[ADDI8_1]] + ; CHECK: BDNZ8 %bb.1, implicit-def dead $ctr8, implicit $ctr8 + ; CHECK: B %bb.2 + ; CHECK: bb.2: + bb.0: + liveins: $x3, $x4 + + %0:g8rc_and_g8rc_nox0 = COPY $x4 + %1:g8rc_and_g8rc_nox0 = COPY $x3 + %2:g8rc_and_g8rc_nox0 = ADDI8 %1, 1 + %3:crrc = CMPLDI %0, 1 + %4:g8rc_and_g8rc_nox0 = LI8 1 + %5:g8rc = ISEL8 %0, %4, %3.sub_gt + MTCTR8loop %5, implicit-def dead $ctr8 + %6:g8rc = LI8 0 + %7:g8rc = LI8 2 + %8:g8rc = LI8 3 + %9:g8rc = LI8 5 + %10:g8rc = LI8 6 + %11:g8rc = LI8 7 + + bb.1: + %12:g8rc = ADDI8 %2, 1 + %13:g8rc = LD 0, %2 :: (load 8) + %14:g8rc = LDX %2, %4 :: (load 8) + %16:g8rc = LDX %2, %8 :: (load 8) + %17:g8rc = LD 4, %2 :: (load 8) + %18:g8rc = LDX %2, %9 :: (load 8) + %19:g8rc = LDX %2, %10 :: (load 8) + %20:g8rc = LDX %2, %11 :: (load 8) + %21:g8rc = LD 8, %2 :: (load 8) + %22:g8rc = MULLD %14, %13 + %15:g8rc = LDX %2, %7 :: (load 8) + %23:g8rc = MULLD %22, %15 + %24:g8rc = MULLD %23, %16 + %25:g8rc = MULLD %24, %17 + %26:g8rc = MULLD %25, %18 + %27:g8rc = MULLD %26, %19 + %28:g8rc = MULLD %27, %20 + %6:g8rc = MADDLD8 %28, %21, %6 + %2:g8rc_and_g8rc_nox0 = COPY %12 + BDNZ8 %bb.1, implicit-def dead $ctr8, implicit $ctr8 + B %bb.2 + + bb.2: +... diff --git a/llvm/test/CodeGen/PowerPC/dform-adjust.ll b/llvm/test/CodeGen/PowerPC/dform-adjust.ll --- a/llvm/test/CodeGen/PowerPC/dform-adjust.ll +++ b/llvm/test/CodeGen/PowerPC/dform-adjust.ll @@ -5,18 +5,18 @@ ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li 5, -13 -; CHECK-NEXT: lxvx 0, 3, 5 -; CHECK-NEXT: li 5, 19 -; CHECK-NEXT: lxvx 1, 3, 5 -; CHECK-NEXT: li 5, 3 ; CHECK-NEXT: li 6, 7 ; CHECK-NEXT: li 7, 11 ; CHECK-NEXT: li 8, 15 -; CHECK-NEXT: mfvsrld 9, 0 -; CHECK-NEXT: ldx 5, 3, 5 +; CHECK-NEXT: lxvx 0, 3, 5 +; CHECK-NEXT: li 5, 19 ; CHECK-NEXT: ldx 6, 3, 6 ; CHECK-NEXT: ldx 7, 3, 7 +; CHECK-NEXT: lxvx 1, 3, 5 +; CHECK-NEXT: li 5, 3 +; CHECK-NEXT: ldx 5, 3, 5 ; CHECK-NEXT: ldx 3, 3, 8 +; CHECK-NEXT: mfvsrld 9, 0 ; CHECK-NEXT: mffprd 8, 0 ; CHECK-NEXT: mfvsrld 10, 1 ; CHECK-NEXT: mfvsrd 11, 1 diff --git a/llvm/test/CodeGen/PowerPC/extract-and-store.ll b/llvm/test/CodeGen/PowerPC/extract-and-store.ll --- a/llvm/test/CodeGen/PowerPC/extract-and-store.ll +++ b/llvm/test/CodeGen/PowerPC/extract-and-store.ll @@ -508,9 +508,9 @@ ; CHECK-P9-BE-LABEL: test_consecutive_i32: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 +; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stfiwx f0, 0, r5 ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 ; CHECK-P9-BE-NEXT: blr entry: @@ -544,9 +544,9 @@ ; CHECK-P9-LABEL: test_consecutive_float: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 +; CHECK-P9-NEXT: li r3, 4 ; CHECK-P9-NEXT: stfiwx f0, 0, r5 ; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-NEXT: li r3, 4 ; CHECK-P9-NEXT: stfiwx f0, r5, r3 ; CHECK-P9-NEXT: blr ; @@ -597,9 +597,9 @@ ; CHECK-P9-LABEL: test_stores_exceed_vec_size: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha +; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-P9-NEXT: lxvx vs35, 0, r3 -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-NEXT: li r3, 16 ; CHECK-P9-NEXT: stfiwx f0, r5, r3 ; CHECK-P9-NEXT: li r3, 20 @@ -611,10 +611,10 @@ ; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: xxspltw vs0, vs34, 0 -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2 ; CHECK-P9-BE-NEXT: li r3, 16 ; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3 ; CHECK-P9-BE-NEXT: li r3, 20 +; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2 ; CHECK-P9-BE-NEXT: stxv vs0, 0(r5) ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 @@ -676,9 +676,9 @@ ; CHECK-P9-LABEL: test_5_consecutive_stores_of_bytes: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4 +; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: stxsibx vs35, 0, r5 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 12 -; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 15 ; CHECK-P9-NEXT: li r3, 2 @@ -694,9 +694,9 @@ ; CHECK-P9-BE-LABEL: test_5_consecutive_stores_of_bytes: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13 +; CHECK-P9-BE-NEXT: li r3, 1 ; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 5 -; CHECK-P9-BE-NEXT: li r3, 1 ; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 2 ; CHECK-P9-BE-NEXT: li r3, 2 @@ -807,9 +807,9 @@ ; CHECK-P9-NEXT: li r3, 4 ; CHECK-P9-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4 +; CHECK-P9-NEXT: li r3, 5 ; CHECK-P9-NEXT: stxsibx vs35, 0, r5 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 8 -; CHECK-P9-NEXT: li r3, 5 ; CHECK-P9-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 13 ; CHECK-P9-NEXT: li r3, 6 @@ -848,9 +848,9 @@ ; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13 +; CHECK-P9-BE-NEXT: li r3, 5 ; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 9 -; CHECK-P9-BE-NEXT: li r3, 5 ; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 4 ; CHECK-P9-BE-NEXT: li r3, 6 @@ -947,8 +947,8 @@ ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 ; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3 ; CHECK-P9-BE-NEXT: stxsiwx vs35, 0, r7 +; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3 ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 0 @@ -996,9 +996,9 @@ ; CHECK-P9-BE-LABEL: test_elements_from_three_vec: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 +; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stfiwx f0, 0, r9 ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs35, vs35, 1 -; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stfiwx f0, r9, r3 ; CHECK-P9-BE-NEXT: li r3, 8 ; CHECK-P9-BE-NEXT: stxsiwx vs36, r9, r3 diff --git a/llvm/test/CodeGen/PowerPC/f128-aggregates.ll b/llvm/test/CodeGen/PowerPC/f128-aggregates.ll --- a/llvm/test/CodeGen/PowerPC/f128-aggregates.ll +++ b/llvm/test/CodeGen/PowerPC/f128-aggregates.ll @@ -229,8 +229,8 @@ ; CHECK-LABEL: testMixedAggregate_03: ; CHECK: # %bb.0: # %entry ; CHECK: mtvsrwa v2, r3 -; CHECK: xscvsdqp v2, v2 -; CHECK: mtvsrdd v3, r6, r5 +; CHECK-DAG: xscvsdqp v2, v2 +; CHECK-DAG: mtvsrdd v3, r6, r5 ; CHECK: xsaddqp v2, v3, v2 ; CHECK: mtvsrd v[[REG1:[0-9]+]], r10 ; CHECK: xscvsdqp v[[REG:[0-9]+]], v[[REG1]] @@ -351,12 +351,12 @@ ; CHECK-NEXT: bltlr cr0 ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: addi r3, r1, 40 +; CHECK-NEXT: addi [[REG2:r[0-9]+]], r1, 72 ; CHECK-NEXT: lxvx v3, 0, r3 +; CHECK-NEXT: std [[REG2]], -8(r1) ; CHECK-NEXT: xsaddqp v2, v3, v2 ; CHECK-NEXT: lxv v3, 16(r3) ; CHECK-NEXT: xsaddqp v2, v2, v3 -; CHECK-NEXT: addi [[REG2:r[0-9]+]], r1, 72 -; CHECK-NEXT: std [[REG2]], -8(r1) ; CHECK-NEXT: blr entry: %ap = alloca i8*, align 8 diff --git a/llvm/test/CodeGen/PowerPC/f128-conv.ll b/llvm/test/CodeGen/PowerPC/f128-conv.ll --- a/llvm/test/CodeGen/PowerPC/f128-conv.ll +++ b/llvm/test/CodeGen/PowerPC/f128-conv.ll @@ -444,10 +444,10 @@ ; CHECK-LABEL: qpConv2dp_03: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r5, r2, .LC7@toc@ha +; CHECK-NEXT: sldi r4, r4, 3 ; CHECK-NEXT: ld r5, .LC7@toc@l(r5) ; CHECK-NEXT: lxvx v2, 0, r5 ; CHECK-NEXT: xscvqpdp v2, v2 -; CHECK-NEXT: sldi r4, r4, 3 ; CHECK-NEXT: stxsdx v2, r3, r4 ; CHECK-NEXT: blr entry: @@ -517,11 +517,11 @@ ; CHECK-LABEL: qpConv2sp_03: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r5, r2, .LC7@toc@ha +; CHECK-NEXT: sldi r4, r4, 2 ; CHECK-NEXT: ld r5, .LC7@toc@l(r5) ; CHECK-NEXT: lxv v2, 48(r5) ; CHECK-NEXT: xscvqpdpo v2, v2 ; CHECK-NEXT: xsrsp f0, v2 -; CHECK-NEXT: sldi r4, r4, 2 ; CHECK-NEXT: stfsx f0, r3, r4 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/f128-passByValue.ll b/llvm/test/CodeGen/PowerPC/f128-passByValue.ll --- a/llvm/test/CodeGen/PowerPC/f128-passByValue.ll +++ b/llvm/test/CodeGen/PowerPC/f128-passByValue.ll @@ -154,13 +154,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK: lwz r3, 96(r1) ; CHECK: add r4, r7, r9 +; CHECK: xscpsgndp v[[REG0:[0-9]+]], f1, f1 ; CHECK: add r4, r4, r10 +; CHECK: xscvdpqp v[[REG0]], v[[REG0]] ; CHECK: add r3, r4, r3 ; CHECK: clrldi r3, r3, 32 ; CHECK: std r3, 0(r6) ; CHECK: lxv v[[REG1:[0-9]+]], 0(r8) -; CHECK: xscpsgndp v[[REG0:[0-9]+]], f1, f1 -; CHECK: xscvdpqp v[[REG0]], v[[REG0]] ; CHECK: xsaddqp v2, v[[REG1]], v2 ; CHECK: xsaddqp v2, v2, v3 ; CHECK-NEXT: blr @@ -186,13 +186,13 @@ ; CHECK-LABEL: mixParam_02f: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: add r4, r4, r6 +; CHECK-NEXT: xscpsgndp v[[REG0:[0-9]+]], f1, f1 ; CHECK-NEXT: add r4, r4, r7 +; CHECK-NEXT: xscvdpqp v[[REG0]], v[[REG0]] ; CHECK-NEXT: add r4, r4, r8 ; CHECK-NEXT: clrldi r4, r4, 32 ; CHECK-DAG: std r4, 0(r3) ; CHECK-DAG: lxv v[[REG1:[0-9]+]], 0(r5) -; CHECK-NEXT: xscpsgndp v[[REG0:[0-9]+]], f1, f1 -; CHECK-NEXT: xscvdpqp v[[REG0]], v[[REG0]] ; CHECK-NEXT: xsaddqp v2, v[[REG1]], v2 ; CHECK-NEXT: xsaddqp v2, v2, v[[REG0]] ; CHECK-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/float-load-store-pair.ll b/llvm/test/CodeGen/PowerPC/float-load-store-pair.ll --- a/llvm/test/CodeGen/PowerPC/float-load-store-pair.ll +++ b/llvm/test/CodeGen/PowerPC/float-load-store-pair.ll @@ -32,10 +32,19 @@ ; CHECK-NEXT: std 0, 16(1) ; CHECK-NEXT: stdu 1, -192(1) ; CHECK-NEXT: addis 3, 2, a1@toc@ha +; CHECK-NEXT: addis 6, 2, a17@toc@ha +; CHECK-NEXT: addis 5, 2, a16@toc@ha +; CHECK-NEXT: addis 4, 2, a15@toc@ha ; CHECK-NEXT: lfd 1, a1@toc@l(3) ; CHECK-NEXT: addis 3, 2, a2@toc@ha +; CHECK-NEXT: addi 6, 6, a17@toc@l +; CHECK-NEXT: addi 5, 5, a16@toc@l +; CHECK-NEXT: ld 4, a15@toc@l(4) ; CHECK-NEXT: lfd 2, a2@toc@l(3) ; CHECK-NEXT: addis 3, 2, a3@toc@ha +; CHECK-NEXT: lxvx 34, 0, 6 +; CHECK-NEXT: lxvx 0, 0, 5 +; CHECK-NEXT: li 5, 168 ; CHECK-NEXT: lfd 3, a3@toc@l(3) ; CHECK-NEXT: addis 3, 2, a4@toc@ha ; CHECK-NEXT: lfd 4, a4@toc@l(3) @@ -52,15 +61,6 @@ ; CHECK-NEXT: addis 3, 2, a10@toc@ha ; CHECK-NEXT: lfd 10, a10@toc@l(3) ; CHECK-NEXT: addis 3, 2, a11@toc@ha -; CHECK-NEXT: addis 6, 2, a17@toc@ha -; CHECK-NEXT: addis 5, 2, a16@toc@ha -; CHECK-NEXT: addi 6, 6, a17@toc@l -; CHECK-NEXT: addi 5, 5, a16@toc@l -; CHECK-NEXT: lxvx 34, 0, 6 -; CHECK-NEXT: addis 4, 2, a15@toc@ha -; CHECK-NEXT: lxvx 0, 0, 5 -; CHECK-NEXT: ld 4, a15@toc@l(4) -; CHECK-NEXT: li 5, 168 ; CHECK-NEXT: lfd 11, a11@toc@l(3) ; CHECK-NEXT: addis 3, 2, a12@toc@ha ; CHECK-NEXT: lfd 12, a12@toc@l(3) diff --git a/llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll b/llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll --- a/llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll +++ b/llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll @@ -132,8 +132,8 @@ ; CHECK-P9-BE-LABEL: load_swap11: ; CHECK-P9-BE: # %bb.0: ; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l ; CHECK-P9-BE-NEXT: lxv v2, 0(r4) +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l ; CHECK-P9-BE-NEXT: lxvx v3, 0, r3 ; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 ; CHECK-P9-BE-NEXT: blr @@ -208,8 +208,8 @@ ; CHECK-P9-BE-LABEL: load_swap21: ; CHECK-P9-BE: # %bb.0: ; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI5_0@toc@ha -; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI5_0@toc@l ; CHECK-P9-BE-NEXT: lxv v2, 0(r4) +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI5_0@toc@l ; CHECK-P9-BE-NEXT: lxvx v3, 0, r3 ; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 ; CHECK-P9-BE-NEXT: blr @@ -382,8 +382,8 @@ ; CHECK-P9-BE-LABEL: load_swap51: ; CHECK-P9-BE: # %bb.0: ; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI10_0@toc@ha -; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI10_0@toc@l ; CHECK-P9-BE-NEXT: lxv v2, 0(r4) +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI10_0@toc@l ; CHECK-P9-BE-NEXT: lxvx v3, 0, r3 ; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 ; CHECK-P9-BE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -23,11 +23,11 @@ ; CHECK: .LBB0_2: # ; CHECK-NEXT: ldx r9, r3, r6 ; CHECK-NEXT: ldx r10, r3, r7 -; CHECK-NEXT: mulld r9, r10, r9 ; CHECK-NEXT: ldx r11, r3, r8 -; CHECK-NEXT: mulld r9, r9, r11 ; CHECK-NEXT: ld r12, 0(r3) ; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: mulld r9, r10, r9 +; CHECK-NEXT: mulld r9, r9, r11 ; CHECK-NEXT: maddld r5, r9, r12, r5 ; CHECK-NEXT: bdnz .LBB0_2 %3 = sext i32 %1 to i64 @@ -87,11 +87,11 @@ ; CHECK: .LBB1_2: # ; CHECK-NEXT: ldx r9, r6, r7 ; CHECK-NEXT: ld r10, 0(r6) -; CHECK-NEXT: mulld r9, r10, r9 ; CHECK-NEXT: ldx r11, r6, r5 -; CHECK-NEXT: mulld r9, r9, r11 ; CHECK-NEXT: addi r8, r6, 1 ; CHECK-NEXT: ld r6, 4(r6) +; CHECK-NEXT: mulld r9, r10, r9 +; CHECK-NEXT: mulld r9, r9, r11 ; CHECK-NEXT: maddld r3, r9, r6, r3 ; CHECK-NEXT: mr r6, r8 ; CHECK-NEXT: bdnz .LBB1_2 @@ -162,22 +162,22 @@ ; CHECK: .LBB2_2: # ; CHECK-NEXT: ldx r12, r9, r6 ; CHECK-NEXT: ld r0, 0(r9) -; CHECK-NEXT: mulld r12, r0, r12 +; CHECK-NEXT: ldx r30, r9, r5 +; CHECK-NEXT: ldx r29, r9, r7 ; CHECK-NEXT: addi r11, r9, 1 -; CHECK-NEXT: ldx r30, r9, r7 -; CHECK-NEXT: ld r29, 4(r9) -; CHECK-NEXT: ldx r28, r9, r8 -; CHECK-NEXT: ld r27, 12(r9) -; CHECK-NEXT: ld r26, 8(r9) -; CHECK-NEXT: ldx r25, r9, r10 -; CHECK-NEXT: ldx r9, r9, r5 -; CHECK-NEXT: mulld r9, r12, r9 -; CHECK-NEXT: mulld r9, r9, r30 -; CHECK-NEXT: mulld r9, r9, r29 -; CHECK-NEXT: mulld r9, r9, r28 -; CHECK-NEXT: mulld r9, r9, r27 -; CHECK-NEXT: mulld r9, r9, r26 -; CHECK-NEXT: maddld r3, r9, r25, r3 +; CHECK-NEXT: mulld r12, r0, r12 +; CHECK-NEXT: ld r28, 4(r9) +; CHECK-NEXT: ldx r27, r9, r8 +; CHECK-NEXT: ld r26, 12(r9) +; CHECK-NEXT: ld r25, 8(r9) +; CHECK-NEXT: ldx r9, r9, r10 +; CHECK-NEXT: mulld r12, r12, r30 +; CHECK-NEXT: mulld r12, r12, r29 +; CHECK-NEXT: mulld r12, r12, r28 +; CHECK-NEXT: mulld r12, r12, r27 +; CHECK-NEXT: mulld r12, r12, r26 +; CHECK-NEXT: mulld r12, r12, r25 +; CHECK-NEXT: maddld r3, r12, r9, r3 ; CHECK-NEXT: mr r9, r11 ; CHECK-NEXT: bdnz .LBB2_2 %3 = sext i32 %1 to i64 @@ -257,10 +257,10 @@ ; CHECK: .LBB3_2: # ; CHECK-NEXT: ldu r8, 4(r3) ; CHECK-NEXT: ldx r9, r3, r7 -; CHECK-NEXT: mulld r8, r8, r9 ; CHECK-NEXT: ldx r10, r3, r6 -; CHECK-NEXT: mulld r8, r8, r10 ; CHECK-NEXT: ld r11, 4(r3) +; CHECK-NEXT: mulld r8, r8, r9 +; CHECK-NEXT: mulld r8, r8, r10 ; CHECK-NEXT: maddld r5, r8, r11, r5 ; CHECK-NEXT: bdnz .LBB3_2 %3 = sext i32 %1 to i64 @@ -391,21 +391,21 @@ ; CHECK: .LBB5_2: # ; CHECK-NEXT: ld r8, 0(r3) ; CHECK-NEXT: ldx r9, r3, r7 -; CHECK-NEXT: mulld r8, r9, r8 -; CHECK-NEXT: ld r9, 4(r3) -; CHECK-NEXT: mulld r8, r8, r9 -; CHECK-NEXT: ld r10, 8(r3) +; CHECK-NEXT: ld r10, 4(r3) +; CHECK-NEXT: ld r11, 8(r3) ; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: mulld r8, r9, r8 +; CHECK-NEXT: ld r12, 0(r4) +; CHECK-NEXT: ldx r0, r4, r7 +; CHECK-NEXT: ld r30, 4(r4) +; CHECK-NEXT: ld r9, 8(r4) +; CHECK-NEXT: addi r4, r4, 1 ; CHECK-NEXT: mulld r8, r8, r10 -; CHECK-NEXT: ld r11, 0(r4) ; CHECK-NEXT: mulld r8, r8, r11 -; CHECK-NEXT: ldx r12, r4, r7 ; CHECK-NEXT: mulld r8, r8, r12 -; CHECK-NEXT: ld r0, 4(r4) ; CHECK-NEXT: mulld r8, r8, r0 -; CHECK-NEXT: ld r30, 8(r4) -; CHECK-NEXT: addi r4, r4, 1 -; CHECK-NEXT: maddld r6, r8, r30, r6 +; CHECK-NEXT: mulld r8, r8, r30 +; CHECK-NEXT: maddld r6, r8, r9, r6 ; CHECK-NEXT: bdnz .LBB5_2 %4 = sext i32 %2 to i64 %5 = icmp eq i32 %2, 0 @@ -501,11 +501,11 @@ ; CHECK-NEXT: add r29, r0, r29 ; CHECK-NEXT: .LBB6_3: # ; CHECK-NEXT: mulld r0, r29, r28 +; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: mulld r0, r0, r30 ; CHECK-NEXT: mulld r0, r0, r12 ; CHECK-NEXT: mulld r0, r0, r11 ; CHECK-NEXT: maddld r3, r0, r7, r3 -; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: bdz .LBB6_9 ; CHECK-NEXT: .LBB6_4: # ; CHECK-NEXT: lbzu r0, 1(r5) @@ -711,10 +711,10 @@ ; CHECK-NEXT: lfsx f0, r3, r4 ; CHECK-NEXT: xscvuxdsp f4, f4 ; CHECK-NEXT: lfs f2, 20(r3) -; CHECK-NEXT: xsmulsp f0, f0, f4 -; CHECK-NEXT: xsmulsp f0, f2, f0 ; CHECK-NEXT: lfs f3, 60(r3) ; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: xsmulsp f0, f0, f4 +; CHECK-NEXT: xsmulsp f0, f2, f0 ; CHECK-NEXT: xsmulsp f0, f3, f0 ; CHECK-NEXT: xsaddsp f1, f1, f0 ; CHECK-NEXT: bdnz .LBB8_2 diff --git a/llvm/test/CodeGen/PowerPC/machine-pre.ll b/llvm/test/CodeGen/PowerPC/machine-pre.ll --- a/llvm/test/CodeGen/PowerPC/machine-pre.ll +++ b/llvm/test/CodeGen/PowerPC/machine-pre.ll @@ -111,10 +111,10 @@ ; CHECK-P9-NEXT: b .LBB1_2 ; CHECK-P9-NEXT: .LBB1_7: # %while.end ; CHECK-P9-NEXT: lis r3, -13108 -; CHECK-P9-NEXT: ori r3, r3, 52429 -; CHECK-P9-NEXT: mullw r3, r28, r3 ; CHECK-P9-NEXT: lis r4, 13107 +; CHECK-P9-NEXT: ori r3, r3, 52429 ; CHECK-P9-NEXT: ori r4, r4, 13108 +; CHECK-P9-NEXT: mullw r3, r28, r3 ; CHECK-P9-NEXT: cmplw r3, r4 ; CHECK-P9-NEXT: blt cr0, .LBB1_9 ; CHECK-P9-NEXT: # %bb.8: # %if.then8 diff --git a/llvm/test/CodeGen/PowerPC/mi-peephole-splat.ll b/llvm/test/CodeGen/PowerPC/mi-peephole-splat.ll --- a/llvm/test/CodeGen/PowerPC/mi-peephole-splat.ll +++ b/llvm/test/CodeGen/PowerPC/mi-peephole-splat.ll @@ -89,8 +89,8 @@ ; CHECK-P9LE-NEXT: nop ; CHECK-P9LE-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; CHECK-P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 -; CHECK-P9LE-NEXT: xxmrghd 0, 0, 1 ; CHECK-P9LE-NEXT: lxv 63, 48(1) # 16-byte Folded Reload +; CHECK-P9LE-NEXT: xxmrghd 0, 0, 1 ; CHECK-P9LE-NEXT: xxswapd 1, 0 ; CHECK-P9LE-NEXT: xssubdp 1, 1, 0 ; CHECK-P9LE-NEXT: addi 1, 1, 64 @@ -116,8 +116,8 @@ ; CHECK-P9BE-NEXT: nop ; CHECK-P9BE-NEXT: lxv 0, 112(1) # 16-byte Folded Reload ; CHECK-P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 -; CHECK-P9BE-NEXT: xxmrghd 0, 0, 1 ; CHECK-P9BE-NEXT: lxv 63, 128(1) # 16-byte Folded Reload +; CHECK-P9BE-NEXT: xxmrghd 0, 0, 1 ; CHECK-P9BE-NEXT: xxswapd 1, 0 ; CHECK-P9BE-NEXT: xssubdp 1, 0, 1 ; CHECK-P9BE-NEXT: addi 1, 1, 144 diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll @@ -1398,10 +1398,10 @@ ; PC64LE9-NEXT: li 3, 0 ; PC64LE9-NEXT: xxlxor 2, 2, 2 ; PC64LE9-NEXT: xxlxor 4, 4, 4 +; PC64LE9-NEXT: mr 30, 4 ; PC64LE9-NEXT: std 3, 8(4) ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: fmr 3, 31 -; PC64LE9-NEXT: mr 30, 4 ; PC64LE9-NEXT: stfd 31, 0(4) ; PC64LE9-NEXT: bl __gcc_qadd ; PC64LE9-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -13,31 +13,31 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lfd f0, 0(r5) ; CHECK-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-NEXT: xxlxor v3, v3, v3 +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: xxpermdi v5, f0, f0, 2 ; CHECK-NEXT: addi r5, r5, .LCPI0_0@toc@l ; CHECK-NEXT: lxvx v2, 0, r5 ; CHECK-NEXT: addis r5, r2, .LCPI0_1@toc@ha ; CHECK-NEXT: addi r5, r5, .LCPI0_1@toc@l ; CHECK-NEXT: lxvx v4, 0, r5 -; CHECK-NEXT: xxpermdi v5, f0, f0, 2 -; CHECK-NEXT: xxlxor v3, v3, v3 ; CHECK-NEXT: li r5, 4 ; CHECK-NEXT: vperm v0, v3, v5, v2 ; CHECK-NEXT: mtctr r5 ; CHECK-NEXT: li r5, 0 ; CHECK-NEXT: vperm v1, v5, v3, v4 -; CHECK-NEXT: li r6, 0 ; CHECK-NEXT: xvnegsp v5, v0 ; CHECK-NEXT: xvnegsp v0, v1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader ; CHECK-NEXT: # ; CHECK-NEXT: lfd f0, 0(r3) +; CHECK-NEXT: add r7, r3, r4 ; CHECK-NEXT: xxpermdi v1, f0, f0, 2 ; CHECK-NEXT: lfdx f0, r3, r4 ; CHECK-NEXT: vperm v6, v1, v3, v4 ; CHECK-NEXT: vperm v1, v3, v1, v2 ; CHECK-NEXT: xvnegsp v1, v1 -; CHECK-NEXT: add r7, r3, r4 ; CHECK-NEXT: xvnegsp v6, v6 ; CHECK-NEXT: vabsduw v1, v1, v5 ; CHECK-NEXT: vabsduw v6, v6, v0 @@ -72,25 +72,26 @@ ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: lfd f0, 0(r5) ; P9BE-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; P9BE-NEXT: xxlxor v3, v3, v3 +; P9BE-NEXT: li r6, 0 ; P9BE-NEXT: addi r5, r5, .LCPI0_0@toc@l ; P9BE-NEXT: lxvx v2, 0, r5 ; P9BE-NEXT: addis r5, r2, .LCPI0_1@toc@ha +; P9BE-NEXT: xxlor v5, vs0, vs0 ; P9BE-NEXT: addi r5, r5, .LCPI0_1@toc@l ; P9BE-NEXT: lxvx v4, 0, r5 ; P9BE-NEXT: li r5, 4 -; P9BE-NEXT: xxlor v5, vs0, vs0 -; P9BE-NEXT: xxlxor v3, v3, v3 ; P9BE-NEXT: vperm v0, v3, v5, v2 ; P9BE-NEXT: mtctr r5 ; P9BE-NEXT: li r5, 0 ; P9BE-NEXT: vperm v1, v3, v5, v4 -; P9BE-NEXT: li r6, 0 ; P9BE-NEXT: xvnegsp v5, v0 ; P9BE-NEXT: xvnegsp v0, v1 ; P9BE-NEXT: .p2align 4 ; P9BE-NEXT: .LBB0_1: # %for.cond1.preheader ; P9BE-NEXT: # ; P9BE-NEXT: lfd f0, 0(r3) +; P9BE-NEXT: add r7, r3, r4 ; P9BE-NEXT: xxlor v1, vs0, vs0 ; P9BE-NEXT: lfdx f0, r3, r4 ; P9BE-NEXT: vperm v6, v3, v1, v4 @@ -107,20 +108,19 @@ ; P9BE-NEXT: xxlor v6, vs0, vs0 ; P9BE-NEXT: vperm v7, v3, v6, v4 ; P9BE-NEXT: vperm v6, v3, v6, v2 -; P9BE-NEXT: add r7, r3, r4 ; P9BE-NEXT: vextuwlx r3, r5, v1 ; P9BE-NEXT: xvnegsp v6, v6 +; P9BE-NEXT: add r6, r3, r6 ; P9BE-NEXT: xvnegsp v1, v7 -; P9BE-NEXT: vabsduw v1, v1, v0 +; P9BE-NEXT: add r3, r7, r4 ; P9BE-NEXT: vabsduw v6, v6, v5 +; P9BE-NEXT: vabsduw v1, v1, v0 ; P9BE-NEXT: vadduwm v1, v1, v6 ; P9BE-NEXT: xxswapd v6, v1 -; P9BE-NEXT: add r6, r3, r6 ; P9BE-NEXT: vadduwm v1, v1, v6 ; P9BE-NEXT: xxspltw v6, v1, 1 ; P9BE-NEXT: vadduwm v1, v1, v6 ; P9BE-NEXT: vextuwlx r8, r5, v1 -; P9BE-NEXT: add r3, r7, r4 ; P9BE-NEXT: add r6, r8, r6 ; P9BE-NEXT: bdnz .LBB0_1 ; P9BE-NEXT: # %bb.2: # %for.cond.cleanup @@ -183,15 +183,16 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lfd f0, 0(r3) ; CHECK-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-NEXT: xxlxor v3, v3, v3 ; CHECK-NEXT: addi r3, r3, .LCPI1_0@toc@l ; CHECK-NEXT: lxvx v4, 0, r3 ; CHECK-NEXT: addis r3, r2, .LCPI1_1@toc@ha ; CHECK-NEXT: xxpermdi v2, f0, f0, 2 ; CHECK-NEXT: lfd f0, 0(r4) ; CHECK-NEXT: addi r3, r3, .LCPI1_1@toc@l -; CHECK-NEXT: xxlxor v3, v3, v3 ; CHECK-NEXT: lxvx v0, 0, r3 ; CHECK-NEXT: xxpermdi v1, f0, f0, 2 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: vperm v5, v2, v3, v4 ; CHECK-NEXT: vperm v2, v3, v2, v0 ; CHECK-NEXT: vperm v0, v3, v1, v0 @@ -200,7 +201,6 @@ ; CHECK-NEXT: vabsduw v3, v5, v3 ; CHECK-NEXT: vadduwm v2, v3, v2 ; CHECK-NEXT: xxswapd v3, v2 -; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: vadduwm v2, v2, v3 ; CHECK-NEXT: xxspltw v3, v2, 2 ; CHECK-NEXT: vadduwm v2, v2, v3 @@ -212,6 +212,7 @@ ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: lfd f0, 0(r3) ; P9BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9BE-NEXT: xxlxor v3, v3, v3 ; P9BE-NEXT: addi r3, r3, .LCPI1_0@toc@l ; P9BE-NEXT: lxvx v4, 0, r3 ; P9BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha @@ -219,8 +220,8 @@ ; P9BE-NEXT: xxlor v2, vs0, vs0 ; P9BE-NEXT: lfd f0, 0(r4) ; P9BE-NEXT: lxvx v0, 0, r3 -; P9BE-NEXT: xxlxor v3, v3, v3 ; P9BE-NEXT: xxlor v1, vs0, vs0 +; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vperm v5, v3, v2, v4 ; P9BE-NEXT: vperm v2, v3, v2, v0 ; P9BE-NEXT: vperm v0, v3, v1, v0 @@ -232,7 +233,6 @@ ; P9BE-NEXT: vadduwm v2, v2, v3 ; P9BE-NEXT: xxspltw v3, v2, 1 ; P9BE-NEXT: vadduwm v2, v2, v3 -; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuwlx r3, r3, v2 ; P9BE-NEXT: extsw r3, r3 ; P9BE-NEXT: blr @@ -288,12 +288,12 @@ ; CHECK-NEXT: add r5, r3, r4 ; CHECK-NEXT: lfiwzx f0, r3, r4 ; CHECK-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-NEXT: xxlxor v3, v3, v3 +; CHECK-NEXT: xxpermdi v2, f0, f0, 2 ; CHECK-NEXT: addi r3, r3, .LCPI2_0@toc@l ; CHECK-NEXT: lxvx v4, 0, r3 ; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: xxpermdi v2, f0, f0, 2 ; CHECK-NEXT: lfiwzx f0, r5, r3 -; CHECK-NEXT: xxlxor v3, v3, v3 ; CHECK-NEXT: vperm v2, v2, v3, v4 ; CHECK-NEXT: xxpermdi v5, f0, f0, 2 ; CHECK-NEXT: vperm v3, v5, v3, v4 @@ -311,12 +311,12 @@ ; P9BE-NEXT: add r5, r3, r4 ; P9BE-NEXT: lfiwzx f0, r3, r4 ; P9BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9BE-NEXT: xxlxor v3, v3, v3 +; P9BE-NEXT: xxsldwi v2, f0, f0, 1 ; P9BE-NEXT: addi r3, r3, .LCPI2_0@toc@l ; P9BE-NEXT: lxvx v4, 0, r3 ; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: xxsldwi v2, f0, f0, 1 ; P9BE-NEXT: lfiwzx f0, r5, r3 -; P9BE-NEXT: xxlxor v3, v3, v3 ; P9BE-NEXT: vperm v2, v3, v2, v4 ; P9BE-NEXT: xxsldwi v5, f0, f0, 1 ; P9BE-NEXT: vperm v3, v3, v5, v4 @@ -356,13 +356,13 @@ ; CHECK-LABEL: test16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sldi r4, r4, 1 -; CHECK-NEXT: lxsihzx v2, r3, r4 -; CHECK-NEXT: vsplth v2, v2, 3 ; CHECK-NEXT: xxlxor v3, v3, v3 -; CHECK-NEXT: vmrglh v2, v3, v2 +; CHECK-NEXT: lxsihzx v2, r3, r4 ; CHECK-NEXT: vsplth v4, v3, 7 ; CHECK-NEXT: add r6, r3, r4 ; CHECK-NEXT: li r3, 16 +; CHECK-NEXT: vsplth v2, v2, 3 +; CHECK-NEXT: vmrglh v2, v3, v2 ; CHECK-NEXT: vmrglw v2, v2, v4 ; CHECK-NEXT: lxsihzx v4, r6, r3 ; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha @@ -382,17 +382,17 @@ ; P9BE-LABEL: test16: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: sldi r4, r4, 1 -; P9BE-NEXT: add r6, r3, r4 ; P9BE-NEXT: li r7, 16 -; P9BE-NEXT: lxsihzx v2, r6, r7 +; P9BE-NEXT: add r6, r3, r4 ; P9BE-NEXT: lxsihzx v4, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9BE-NEXT: lxsihzx v2, r6, r7 ; P9BE-NEXT: li r6, 0 +; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l ; P9BE-NEXT: sldi r6, r6, 48 ; P9BE-NEXT: vsplth v4, v4, 3 ; P9BE-NEXT: mtvsrd v3, r6 ; P9BE-NEXT: vsplth v2, v2, 3 -; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l ; P9BE-NEXT: vmrghh v4, v3, v4 ; P9BE-NEXT: vmrghh v2, v3, v2 ; P9BE-NEXT: vsplth v3, v3, 0 @@ -448,11 +448,11 @@ ; CHECK-NEXT: li r3, 8 ; CHECK-NEXT: lxsibzx v5, r6, r3 ; CHECK-NEXT: xxswapd v3, vs0 -; CHECK-NEXT: vspltb v4, v3, 15 -; CHECK-NEXT: vspltb v2, v2, 7 -; CHECK-NEXT: vmrglb v2, v3, v2 ; CHECK-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; CHECK-NEXT: vspltb v2, v2, 7 ; CHECK-NEXT: addi r3, r3, .LCPI4_0@toc@l +; CHECK-NEXT: vspltb v4, v3, 15 +; CHECK-NEXT: vmrglb v2, v3, v2 ; CHECK-NEXT: vspltb v5, v5, 7 ; CHECK-NEXT: vmrglh v2, v2, v4 ; CHECK-NEXT: vmrglb v3, v3, v5 @@ -473,9 +473,11 @@ ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: add r6, r3, r4 ; P9BE-NEXT: li r7, 8 -; P9BE-NEXT: lxsibzx v2, r6, r7 ; P9BE-NEXT: lxsibzx v4, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9BE-NEXT: lxsibzx v2, r6, r7 ; P9BE-NEXT: li r6, 0 +; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l ; P9BE-NEXT: sldi r6, r6, 56 ; P9BE-NEXT: vspltb v4, v4, 7 ; P9BE-NEXT: mtvsrd v3, r6 @@ -483,8 +485,6 @@ ; P9BE-NEXT: vmrghb v4, v3, v4 ; P9BE-NEXT: vmrghb v2, v3, v2 ; P9BE-NEXT: vspltb v3, v3, 0 -; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha -; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l ; P9BE-NEXT: vmrghh v4, v4, v3 ; P9BE-NEXT: xxspltw v3, v3, 0 ; P9BE-NEXT: vmrghw v2, v4, v2 diff --git a/llvm/test/CodeGen/PowerPC/remove-redundant-load-imm.ll b/llvm/test/CodeGen/PowerPC/remove-redundant-load-imm.ll --- a/llvm/test/CodeGen/PowerPC/remove-redundant-load-imm.ll +++ b/llvm/test/CodeGen/PowerPC/remove-redundant-load-imm.ll @@ -40,8 +40,8 @@ ; PPC64LE-NEXT: std 0, 16(1) ; PPC64LE-NEXT: stdu 1, -32(1) ; PPC64LE-NEXT: addis 3, 2, .LC0@toc@ha -; PPC64LE-NEXT: ld 3, .LC0@toc@l(3) ; PPC64LE-NEXT: li 4, 0 +; PPC64LE-NEXT: ld 3, .LC0@toc@l(3) ; PPC64LE-NEXT: std 4, 0(3) ; PPC64LE-NEXT: bl barney.94 ; PPC64LE-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_2.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_2.ll --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_2.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_2.ll @@ -67,11 +67,11 @@ ; P9LE: # %bb.0: ; P9LE-NEXT: lfiwzx f0, 0, r3 ; P9LE-NEXT: lfiwzx f1, 0, r4 +; P9LE-NEXT: mr r3, r5 ; P9LE-NEXT: xxpermdi vs0, f0, f0, 2 ; P9LE-NEXT: xxpermdi vs1, f1, f1, 2 ; P9LE-NEXT: xvsubsp vs0, vs0, vs1 ; P9LE-NEXT: xxsldwi vs0, vs0, vs0, 2 -; P9LE-NEXT: mr r3, r5 ; P9LE-NEXT: stfiwx f0, 0, r5 ; P9LE-NEXT: blr ; @@ -79,11 +79,11 @@ ; P9BE: # %bb.0: ; P9BE-NEXT: lfiwzx f0, 0, r3 ; P9BE-NEXT: lfiwzx f1, 0, r4 +; P9BE-NEXT: mr r3, r5 ; P9BE-NEXT: xxsldwi vs0, f0, f0, 1 ; P9BE-NEXT: xxsldwi vs1, f1, f1, 1 ; P9BE-NEXT: xvsubsp vs0, vs0, vs1 ; P9BE-NEXT: xxsldwi vs0, vs0, vs0, 3 -; P9BE-NEXT: mr r3, r5 ; P9BE-NEXT: stfiwx f0, 0, r5 ; P9BE-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll @@ -216,18 +216,18 @@ ; P9LE-LABEL: s2v_test_f3: ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: sldi r4, r7, 2 -; P9LE-NEXT: lfiwzx f0, r3, r4 +; P9LE-DAG: lfiwzx [[F0:f0]], r3, r4 ; P9LE-DAG: xxspltw v2, v2, 2 -; P9LE-DAG: xxpermdi v3, f0, f0, 2 +; P9LE-DAG: xxpermdi v3, [[F0]], [[F0]], 2 ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; P9BE-LABEL: s2v_test_f3: ; P9BE: # %bb.0: # %entry ; P9BE: sldi r4, r7, 2 -; P9BE: lfiwzx f0, r3, r4 +; P9BE-DAG: lfiwzx [[F0:f0]], r3, r4 ; P9BE-DAG: xxspltw v2, v2, 1 -; P9BE-DAG: xxsldwi v3, f0, f0, 1 +; P9BE-DAG: xxsldwi v3, [[F0]], [[F0]], 1 ; P9BE: vmrghw v2, v3, v2 ; P9BE-NEXT: blr @@ -261,18 +261,18 @@ ; P9LE-LABEL: s2v_test_f4: ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: addi r3, r3, 4 -; P9LE-NEXT: lfiwzx f0, 0, r3 -; P9LE-DAG: xxspltw v2, v2, 2 -; P9LE-DAG: xxpermdi v3, f0, f0, 2 +; P9LE-DAG: lfiwzx [[F0:f0]], 0, r3 +; P9LE-DAG: xxspltw v2, v2, 2 +; P9LE-DAG: xxpermdi v3, [[F0]], [[F0]], 2 ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; P9BE-LABEL: s2v_test_f4: ; P9BE: # %bb.0: # %entry ; P9BE: addi r3, r3, 4 -; P9BE: lfiwzx f0, 0, r3 +; P9BE-DAG: lfiwzx [[F0:f0]], 0, r3 ; P9BE-DAG: xxspltw v2, v2, 1 -; P9BE-DAG: xxsldwi v3, f0, f0, 1 +; P9BE-DAG: xxsldwi v3, [[F0]], [[F0]], 1 ; P9BE: vmrghw v2, v3, v2 ; P9BE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll --- a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll @@ -31,25 +31,25 @@ ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: xori 6, 5, 84 ; CHECK-NEXT: clrldi 5, 7, 32 -; CHECK-NEXT: lbz 5, 0(5) ; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: addi 8, 7, -1 +; CHECK-NEXT: lbz 5, 0(5) ; CHECK-NEXT: bdz .LBB0_5 ; CHECK-NEXT: # %bb.2: ; CHECK-NEXT: cntlzw 6, 6 +; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: srwi 7, 6, 5 ; CHECK-NEXT: xori 6, 5, 84 ; CHECK-NEXT: clrldi 5, 8, 32 ; CHECK-NEXT: addi 8, 8, -1 ; CHECK-NEXT: lbz 5, 0(5) -; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: .LBB0_3: # ; CHECK-NEXT: clrldi 10, 8, 32 ; CHECK-NEXT: cntlzw 9, 6 ; CHECK-NEXT: xori 6, 5, 84 -; CHECK-NEXT: lbz 5, 0(10) ; CHECK-NEXT: addi 8, 8, -1 +; CHECK-NEXT: lbz 5, 0(10) ; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: add 4, 4, 7 ; CHECK-NEXT: srwi 7, 9, 5 @@ -62,23 +62,23 @@ ; CHECK-NEXT: add 4, 4, 6 ; CHECK-NEXT: .LBB0_6: ; CHECK-NEXT: xori 5, 5, 84 -; CHECK-NEXT: cntlzw 5, 5 ; CHECK-NEXT: clrldi 3, 3, 32 +; CHECK-NEXT: li 7, 0 +; CHECK-NEXT: li 8, 3 ; CHECK-NEXT: std 3, 104(1) +; CHECK-NEXT: cntlzw 5, 5 ; CHECK-NEXT: addis 3, 2, .LC0@toc@ha +; CHECK-NEXT: li 10, 0 ; CHECK-NEXT: ld 3, .LC0@toc@l(3) -; CHECK-NEXT: li 7, 0 -; CHECK-NEXT: li 8, 3 ; CHECK-NEXT: srwi 5, 5, 5 ; CHECK-NEXT: add 4, 4, 5 ; CHECK-NEXT: li 5, 0 ; CHECK-NEXT: std 5, 120(1) ; CHECK-NEXT: li 5, 3 -; CHECK-NEXT: std 5, 96(1) ; CHECK-NEXT: clrldi 6, 4, 32 ; CHECK-NEXT: li 4, 3 +; CHECK-NEXT: std 5, 96(1) ; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 10, 0 ; CHECK-NEXT: bl printf ; CHECK-NEXT: nop %1 = load i32, i32* undef, align 4 diff --git a/llvm/test/CodeGen/PowerPC/sms-grp-order.ll b/llvm/test/CodeGen/PowerPC/sms-grp-order.ll --- a/llvm/test/CodeGen/PowerPC/sms-grp-order.ll +++ b/llvm/test/CodeGen/PowerPC/sms-grp-order.ll @@ -7,8 +7,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: lha 3, 0(3) ; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: sldi 5, 5, 62 ; CHECK-NEXT: lhz 4, 0(0) +; CHECK-NEXT: sldi 5, 5, 62 ; CHECK-NEXT: mtctr 5 ; CHECK-NEXT: srawi 3, 3, 1 ; CHECK-NEXT: addze 3, 3 diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll --- a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll @@ -21,9 +21,9 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: addi 7, 30, -4 ; CHECK-NEXT: mtctr 3 -; CHECK-NEXT: lwzu 8, 4(7) ; CHECK-NEXT: addi 4, 29, -8 ; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: lwzu 8, 4(7) ; CHECK-NEXT: bdz .LBB0_5 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: extswsli 6, 5, 5 diff --git a/llvm/test/CodeGen/PowerPC/sms-simple.ll b/llvm/test/CodeGen/PowerPC/sms-simple.ll --- a/llvm/test/CodeGen/PowerPC/sms-simple.ll +++ b/llvm/test/CodeGen/PowerPC/sms-simple.ll @@ -10,17 +10,17 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r5, r2, x@toc@ha -; CHECK-NEXT: addi r5, r5, x@toc@l ; CHECK-NEXT: addis r6, r2, y@toc@ha ; CHECK-NEXT: li r7, 340 +; CHECK-NEXT: addi r5, r5, x@toc@l ; CHECK-NEXT: addi r3, r6, y@toc@l ; CHECK-NEXT: lwz r6, y@toc@l(r6) ; CHECK-NEXT: mtctr r7 ; CHECK-NEXT: addi r5, r5, -8 +; CHECK-NEXT: addi r4, r3, -8 ; CHECK-NEXT: lwzu r7, 12(r5) ; CHECK-NEXT: maddld r6, r7, r7, r6 ; CHECK-NEXT: lwz r7, 4(r5) -; CHECK-NEXT: addi r4, r3, -8 ; CHECK-NEXT: stwu r6, 12(r4) ; CHECK-NEXT: maddld r6, r7, r7, r6 ; CHECK-NEXT: lwz r7, 8(r5) @@ -29,12 +29,12 @@ ; CHECK-NEXT: # ; CHECK-NEXT: maddld r7, r7, r7, r6 ; CHECK-NEXT: lwzu r8, 12(r5) -; CHECK-NEXT: maddld r8, r8, r8, r7 ; CHECK-NEXT: stw r6, 4(r4) ; CHECK-NEXT: lwz r6, 4(r5) -; CHECK-NEXT: maddld r6, r6, r6, r8 +; CHECK-NEXT: maddld r8, r8, r8, r7 ; CHECK-NEXT: stw r7, 8(r4) ; CHECK-NEXT: lwz r7, 8(r5) +; CHECK-NEXT: maddld r6, r6, r6, r8 ; CHECK-NEXT: stwu r8, 12(r4) ; CHECK-NEXT: bdnz .LBB0_1 ; CHECK-NEXT: # %bb.2: diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -12,10 +12,10 @@ ; P9LE-LABEL: fold_srem_vec_1: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r5, r4, r5 ; P9LE-NEXT: rldicl r5, r5, 32, 32 @@ -25,13 +25,14 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: lis r5, 31710 ; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: ori r5, r5, 63421 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 63421 ; P9LE-NEXT: mulld r5, r4, r5 ; P9LE-NEXT: rldicl r5, r5, 32, 32 ; P9LE-NEXT: subf r4, r4, r5 @@ -40,14 +41,15 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: lis r5, 21399 ; P9LE-NEXT: mulli r4, r4, -124 +; P9LE-NEXT: ori r5, r5, 33437 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 33437 ; P9LE-NEXT: mulld r4, r4, r5 ; P9LE-NEXT: rldicl r5, r4, 1, 63 ; P9LE-NEXT: rldicl r4, r4, 32, 32 @@ -55,14 +57,14 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: lis r5, -16728 ; P9LE-NEXT: mulli r4, r4, 98 +; P9LE-NEXT: ori r5, r5, 63249 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 63249 ; P9LE-NEXT: mulld r4, r4, r5 ; P9LE-NEXT: rldicl r5, r4, 1, 63 ; P9LE-NEXT: rldicl r4, r4, 32, 32 @@ -70,8 +72,6 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: mulli r4, r4, -1003 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 ; P9LE-NEXT: vmrglh v2, v2, v4 @@ -81,10 +81,10 @@ ; P9BE-LABEL: fold_srem_vec_1: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: lis r4, 31710 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 63421 +; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r4, r4, 32, 32 @@ -96,12 +96,12 @@ ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: lis r4, -21386 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 37253 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 37253 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r4, r4, 32, 32 ; P9BE-NEXT: add r4, r4, r3 @@ -112,12 +112,13 @@ ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: lis r4, -16728 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 63249 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 63249 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r5, r4, 1, 63 ; P9BE-NEXT: rldicl r4, r4, 32, 32 @@ -127,13 +128,12 @@ ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: lis r4, 21399 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: ori r4, r4, 33437 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 33437 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r5, r4, 1, 63 ; P9BE-NEXT: rldicl r4, r4, 32, 32 @@ -289,10 +289,10 @@ ; P9LE-LABEL: fold_srem_vec_2: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r6, r4, r5 ; P9LE-NEXT: rldicl r6, r6, 32, 32 @@ -305,6 +305,7 @@ ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r6, r4, r5 @@ -315,11 +316,12 @@ ; P9LE-NEXT: add r4, r4, r6 ; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r6, r4, r5 ; P9LE-NEXT: rldicl r6, r6, 32, 32 @@ -329,10 +331,10 @@ ; P9LE-NEXT: add r4, r4, r6 ; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r5, r4, r5 @@ -343,8 +345,6 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 ; P9LE-NEXT: vmrglh v2, v2, v4 @@ -354,10 +354,10 @@ ; P9BE-LABEL: fold_srem_vec_2: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 ; P9BE-NEXT: mulld r5, r3, r4 ; P9BE-NEXT: rldicl r5, r5, 32, 32 @@ -385,6 +385,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 ; P9BE-NEXT: mulld r5, r3, r4 @@ -396,7 +397,6 @@ ; P9BE-NEXT: mulli r5, r5, 95 ; P9BE-NEXT: subf r3, r5, r3 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -552,10 +552,10 @@ ; P9LE-LABEL: combine_srem_sdiv: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r6, r4, r5 ; P9LE-NEXT: rldicl r6, r6, 32, 32 @@ -568,6 +568,7 @@ ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: extsh r6, r3 ; P9LE-NEXT: extsw r6, r6 ; P9LE-NEXT: mulld r7, r6, r5 @@ -578,11 +579,12 @@ ; P9LE-NEXT: add r6, r6, r7 ; P9LE-NEXT: mulli r7, r6, 95 ; P9LE-NEXT: subf r3, r7, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: extsh r7, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: extsw r7, r7 ; P9LE-NEXT: mulld r8, r7, r5 ; P9LE-NEXT: rldicl r8, r8, 32, 32 @@ -592,10 +594,10 @@ ; P9LE-NEXT: add r7, r7, r8 ; P9LE-NEXT: mulli r8, r7, 95 ; P9LE-NEXT: subf r3, r8, r3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: extsh r8, r3 ; P9LE-NEXT: extsw r8, r8 ; P9LE-NEXT: mulld r5, r8, r5 @@ -606,8 +608,6 @@ ; P9LE-NEXT: add r5, r5, r8 ; P9LE-NEXT: mulli r8, r5, 95 ; P9LE-NEXT: subf r3, r8, r3 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 ; P9LE-NEXT: mtvsrd f0, r4 @@ -629,10 +629,10 @@ ; P9BE-LABEL: combine_srem_sdiv: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r4, r3 ; P9BE-NEXT: lis r5, -21386 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r5, r5, 37253 +; P9BE-NEXT: extsh r4, r3 ; P9BE-NEXT: extsw r4, r4 ; P9BE-NEXT: mulld r6, r4, r5 ; P9BE-NEXT: rldicl r6, r6, 32, 32 @@ -660,6 +660,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: extsh r7, r3 ; P9BE-NEXT: extsw r7, r7 ; P9BE-NEXT: mulld r8, r7, r5 @@ -671,7 +672,6 @@ ; P9BE-NEXT: mulli r8, r7, 95 ; P9BE-NEXT: subf r3, r8, r3 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -864,7 +864,9 @@ ; P9LE-LABEL: dont_fold_srem_power_of_two: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: lis r5, -21386 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: ori r5, r5, 37253 ; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: srawi r4, r4, 6 ; P9LE-NEXT: addze r4, r4 @@ -873,19 +875,17 @@ ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: srawi r4, r4, 5 ; P9LE-NEXT: addze r4, r4 ; P9LE-NEXT: slwi r4, r4, 5 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -21386 -; P9LE-NEXT: ori r5, r5, 37253 ; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r5, r4, r5 @@ -899,12 +899,12 @@ ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: srawi r4, r4, 3 ; P9LE-NEXT: addze r4, r4 ; P9LE-NEXT: slwi r4, r4, 3 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 ; P9LE-NEXT: vmrglh v2, v4, v2 @@ -931,12 +931,13 @@ ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: lis r4, -21386 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 37253 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 37253 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r4, r4, 32, 32 ; P9BE-NEXT: add r4, r4, r3 @@ -946,7 +947,6 @@ ; P9BE-NEXT: mulli r4, r4, 95 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -1064,27 +1064,29 @@ ; P9LE-LABEL: dont_fold_srem_one: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: lis r5, -14230 +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r5, r5, 30865 +; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r5, r4, r5 ; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: xxlxor v4, v4, v4 ; P9LE-NEXT: add r4, r5, r4 ; P9LE-NEXT: srwi r5, r4, 31 ; P9LE-NEXT: srawi r4, r4, 9 ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: lis r5, -19946 ; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: ori r5, r5, 17097 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: vmrglh v3, v3, v4 ; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 17097 ; P9LE-NEXT: mulld r5, r4, r5 ; P9LE-NEXT: rldicl r5, r5, 32, 32 ; P9LE-NEXT: add r4, r5, r4 @@ -1093,14 +1095,14 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: lis r5, 24749 ; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: ori r5, r5, 47143 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 47143 ; P9LE-NEXT: mulld r4, r4, r5 ; P9LE-NEXT: rldicl r5, r4, 1, 63 ; P9LE-NEXT: rldicl r4, r4, 32, 32 @@ -1108,8 +1110,6 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: mulli r4, r4, 5423 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: vmrglh v3, v3, v4 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 ; P9LE-NEXT: vmrglh v2, v2, v4 @@ -1119,10 +1119,10 @@ ; P9BE-LABEL: dont_fold_srem_one: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: lis r4, -19946 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r4, r4, 32, 32 @@ -1134,12 +1134,12 @@ ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: lis r4, 24749 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 47143 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 47143 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r5, r4, 1, 63 ; P9BE-NEXT: rldicl r4, r4, 32, 32 @@ -1149,12 +1149,13 @@ ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: lis r4, -14230 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 30865 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v3, v4 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 30865 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r4, r4, 32, 32 ; P9BE-NEXT: add r4, r4, r3 @@ -1167,7 +1168,6 @@ ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v3, v4 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 @@ -1290,10 +1290,10 @@ ; P9LE-LABEL: dont_fold_urem_i16_smax: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: lis r5, -19946 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r5, r5, 17097 +; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r5, r4, r5 ; P9LE-NEXT: rldicl r5, r5, 32, 32 @@ -1303,13 +1303,14 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: lis r5, 24749 ; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: ori r5, r5, 47143 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: extsh r4, r3 ; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 47143 ; P9LE-NEXT: mulld r4, r4, r5 ; P9LE-NEXT: rldicl r5, r4, 1, 63 ; P9LE-NEXT: rldicl r4, r4, 32, 32 @@ -1317,20 +1318,19 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: mulli r4, r4, 5423 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxlxor v4, v4, v4 ; P9LE-NEXT: srawi r4, r4, 15 ; P9LE-NEXT: addze r4, r4 ; P9LE-NEXT: slwi r4, r4, 15 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: xxlxor v4, v4, v4 ; P9LE-NEXT: vmrglh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v3, v2 ; P9LE-NEXT: blr @@ -1338,10 +1338,10 @@ ; P9BE-LABEL: dont_fold_urem_i16_smax: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: lis r4, -19946 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r4, r4, 32, 32 @@ -1353,12 +1353,12 @@ ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: lis r4, 24749 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 47143 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 47143 ; P9BE-NEXT: mulld r4, r3, r4 ; P9BE-NEXT: rldicl r5, r4, 1, 63 ; P9BE-NEXT: rldicl r4, r4, 32, 32 @@ -1370,6 +1370,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v3, v4 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: srawi r4, r3, 15 ; P9BE-NEXT: addze r4, r4 @@ -1379,7 +1380,6 @@ ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v3, v4 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 @@ -1488,10 +1488,10 @@ ; P9LE-LABEL: dont_fold_srem_i64: ; P9LE: # %bb.0: ; P9LE-NEXT: lis r4, 24749 +; P9LE-NEXT: mfvsrd r3, v3 ; P9LE-NEXT: ori r4, r4, 47142 ; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: oris r4, r4, 58853 -; P9LE-NEXT: mfvsrd r3, v3 ; P9LE-NEXT: ori r4, r4, 6055 ; P9LE-NEXT: mulhd r4, r3, r4 ; P9LE-NEXT: rldicl r5, r4, 1, 63 @@ -1514,10 +1514,10 @@ ; P9LE-NEXT: sub r4, r4, r5 ; P9LE-NEXT: mtvsrdd v3, r3, r4 ; P9LE-NEXT: lis r4, 25653 +; P9LE-NEXT: mfvsrd r3, v2 ; P9LE-NEXT: ori r4, r4, 15432 ; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: oris r4, r4, 1603 -; P9LE-NEXT: mfvsrd r3, v2 ; P9LE-NEXT: ori r4, r4, 21445 ; P9LE-NEXT: mulhd r4, r3, r4 ; P9LE-NEXT: rldicl r5, r4, 1, 63 @@ -1532,10 +1532,10 @@ ; P9BE-LABEL: dont_fold_srem_i64: ; P9BE: # %bb.0: ; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: mfvsrld r3, v3 ; P9BE-NEXT: ori r4, r4, 47142 ; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: oris r4, r4, 58853 -; P9BE-NEXT: mfvsrld r3, v3 ; P9BE-NEXT: ori r4, r4, 6055 ; P9BE-NEXT: mulhd r4, r3, r4 ; P9BE-NEXT: rldicl r5, r4, 1, 63 @@ -1558,10 +1558,10 @@ ; P9BE-NEXT: sub r4, r4, r5 ; P9BE-NEXT: mtvsrdd v3, r4, r3 ; P9BE-NEXT: lis r4, 25653 +; P9BE-NEXT: mfvsrld r3, v2 ; P9BE-NEXT: ori r4, r4, 15432 ; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: oris r4, r4, 1603 -; P9BE-NEXT: mfvsrld r3, v2 ; P9BE-NEXT: ori r4, r4, 21445 ; P9BE-NEXT: mulhd r4, r3, r4 ; P9BE-NEXT: rldicl r5, r4, 1, 63 diff --git a/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir b/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir @@ -0,0 +1,18 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=postmisched -o - %s | FileCheck %s +--- +# Check that postmisched's TopDepthReduce heuristic moves the MULLD later +# because of the dependency on x5 +name: test +body: | + bb.0: + ; CHECK-LABEL: name: test + ; CHECK: renamable $x5 = LD 0, killed renamable $x5 :: (load 8) + ; CHECK: renamable $x4 = LD 0, killed renamable $x4 :: (load 8) + ; CHECK: renamable $x5 = MULLD killed renamable $x5, renamable $x3 + ; CHECK: renamable $x3 = MADDLD8 killed renamable $x4, killed renamable $x3, killed renamable $x5 + renamable $x5 = LD 0, killed renamable $x5 :: (load 8) + renamable $x5 = MULLD killed renamable $x5, renamable $x3 + renamable $x4 = LD 0, killed renamable $x4 :: (load 8) + renamable $x3 = MADDLD8 killed renamable $x4, killed renamable $x3, killed renamable $x5 +... diff --git a/llvm/test/CodeGen/PowerPC/uint-to-fp-v4i32.ll b/llvm/test/CodeGen/PowerPC/uint-to-fp-v4i32.ll --- a/llvm/test/CodeGen/PowerPC/uint-to-fp-v4i32.ll +++ b/llvm/test/CodeGen/PowerPC/uint-to-fp-v4i32.ll @@ -20,9 +20,9 @@ ; P9BE-NEXT: mtfprwz f0, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: xscvuxddp f0, f0 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9BE-NEXT: mtfprwz f1, r3 -; P9BE-NEXT: xscvuxddp f0, f0 ; P9BE-NEXT: xscvuxddp f1, f1 ; P9BE-NEXT: xxmrghd v2, vs0, vs1 ; P9BE-NEXT: blr @@ -35,9 +35,9 @@ ; P9LE-NEXT: mtfprwz f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xscvuxddp f0, f0 ; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9LE-NEXT: mtfprwz f1, r3 -; P9LE-NEXT: xscvuxddp f0, f0 ; P9LE-NEXT: xscvuxddp f1, f1 ; P9LE-NEXT: xxmrghd v2, vs1, vs0 ; P9LE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/unaligned-addressing-mode.ll b/llvm/test/CodeGen/PowerPC/unaligned-addressing-mode.ll --- a/llvm/test/CodeGen/PowerPC/unaligned-addressing-mode.ll +++ b/llvm/test/CodeGen/PowerPC/unaligned-addressing-mode.ll @@ -6,8 +6,8 @@ ; CHECK-LABEL: test_xaddr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li r4, 0 -; CHECK-NEXT: ori r4, r4, 40000 ; CHECK-NEXT: std r3, -8(r1) +; CHECK-NEXT: ori r4, r4, 40000 ; CHECK-NEXT: lbzx r3, r3, r4 ; CHECK-NEXT: blr entry: @@ -56,8 +56,8 @@ ; CHECK-LABEL: test_xoaddr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi r3, r3, 8 -; CHECK-NEXT: lxvx vs0, 0, r3 ; CHECK-NEXT: addi r4, r4, 4 +; CHECK-NEXT: lxvx vs0, 0, r3 ; CHECK-NEXT: stxvx vs0, 0, r4 ; CHECK-NEXT: blr entry: @@ -77,9 +77,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi r4, r3, -8 ; CHECK-NEXT: li r3, 8 +; CHECK-NEXT: li r5, 3 ; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: li r5, 3 ; loop instruction number is changed from 5 to 4, so its align is changed from 5 to 4. ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB4_1: # %for.body diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -12,9 +12,11 @@ ; P9LE-LABEL: fold_urem_vec_1: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r5, 21399 +; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r5, r5, 33437 +; P9LE-NEXT: ori r6, r6, 8969 ; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: mulld r4, r4, r5 ; P9LE-NEXT: lis r5, 16727 @@ -25,6 +27,7 @@ ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: mulld r4, r4, r5 ; P9LE-NEXT: lis r5, 8456 @@ -32,24 +35,21 @@ ; P9LE-NEXT: rldicl r4, r4, 24, 40 ; P9LE-NEXT: mulli r4, r4, 1003 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: rlwinm r4, r3, 30, 18, 31 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: mulld r4, r4, r5 ; P9LE-NEXT: rldicl r4, r4, 30, 34 ; P9LE-NEXT: mulli r4, r4, 124 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 -; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: clrldi r5, r4, 32 ; P9LE-NEXT: mulld r5, r5, r6 ; P9LE-NEXT: rldicl r5, r5, 32, 32 @@ -68,10 +68,10 @@ ; P9BE-LABEL: fold_urem_vec_1: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9BE-NEXT: lis r5, 16727 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r5, r5, 2287 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9BE-NEXT: clrldi r4, r3, 32 ; P9BE-NEXT: mulld r4, r4, r5 ; P9BE-NEXT: lis r5, 21399 @@ -95,6 +95,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: clrlwi r4, r3, 16 ; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 ; P9BE-NEXT: mulld r3, r3, r5 @@ -104,7 +105,6 @@ ; P9BE-NEXT: mulli r3, r3, 124 ; P9BE-NEXT: subf r3, r3, r4 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -241,10 +241,10 @@ ; P9LE-LABEL: fold_urem_vec_2: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: clrldi r5, r4, 32 ; P9LE-NEXT: mulld r5, r5, r6 ; P9LE-NEXT: rldicl r5, r5, 32, 32 @@ -257,6 +257,7 @@ ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: clrldi r5, r4, 32 ; P9LE-NEXT: mulld r5, r5, r6 @@ -267,11 +268,12 @@ ; P9LE-NEXT: srwi r4, r4, 6 ; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: clrldi r5, r4, 32 ; P9LE-NEXT: mulld r5, r5, r6 ; P9LE-NEXT: rldicl r5, r5, 32, 32 @@ -281,10 +283,10 @@ ; P9LE-NEXT: srwi r4, r4, 6 ; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: clrldi r5, r4, 32 ; P9LE-NEXT: mulld r5, r5, r6 @@ -295,8 +297,6 @@ ; P9LE-NEXT: srwi r4, r4, 6 ; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 ; P9LE-NEXT: vmrglh v2, v2, v4 @@ -306,10 +306,10 @@ ; P9BE-LABEL: fold_urem_vec_2: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9BE-NEXT: lis r5, 22765 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9BE-NEXT: clrldi r4, r3, 32 ; P9BE-NEXT: mulld r4, r4, r5 ; P9BE-NEXT: rldicl r4, r4, 32, 32 @@ -337,6 +337,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9BE-NEXT: clrldi r4, r3, 32 ; P9BE-NEXT: mulld r4, r4, r5 @@ -348,7 +349,6 @@ ; P9BE-NEXT: mulli r4, r4, 95 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -506,10 +506,10 @@ ; P9LE-LABEL: combine_urem_udiv: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: clrldi r5, r4, 32 ; P9LE-NEXT: mulld r5, r5, r6 ; P9LE-NEXT: rldicl r5, r5, 32, 32 @@ -522,6 +522,7 @@ ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: rlwinm r5, r3, 0, 16, 31 ; P9LE-NEXT: clrldi r7, r5, 32 ; P9LE-NEXT: mulld r7, r7, r6 @@ -532,11 +533,12 @@ ; P9LE-NEXT: srwi r5, r5, 6 ; P9LE-NEXT: mulli r7, r5, 95 ; P9LE-NEXT: subf r3, r7, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: rlwinm r7, r3, 0, 16, 31 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: clrldi r8, r7, 32 ; P9LE-NEXT: mulld r8, r8, r6 ; P9LE-NEXT: rldicl r8, r8, 32, 32 @@ -546,10 +548,10 @@ ; P9LE-NEXT: srwi r7, r7, 6 ; P9LE-NEXT: mulli r8, r7, 95 ; P9LE-NEXT: subf r3, r8, r3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: rlwinm r8, r3, 0, 16, 31 ; P9LE-NEXT: clrldi r9, r8, 32 ; P9LE-NEXT: mulld r6, r9, r6 @@ -560,8 +562,6 @@ ; P9LE-NEXT: srwi r6, r6, 6 ; P9LE-NEXT: mulli r8, r6, 95 ; P9LE-NEXT: subf r3, r8, r3 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 ; P9LE-NEXT: mtvsrd f0, r4 @@ -583,10 +583,10 @@ ; P9BE-LABEL: combine_urem_udiv: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9BE-NEXT: lis r6, 22765 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r6, r6, 8969 +; P9BE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9BE-NEXT: clrldi r5, r4, 32 ; P9BE-NEXT: mulld r5, r5, r6 ; P9BE-NEXT: rldicl r5, r5, 32, 32 @@ -614,6 +614,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: rlwinm r7, r3, 0, 16, 31 ; P9BE-NEXT: clrldi r8, r7, 32 ; P9BE-NEXT: mulld r8, r8, r6 @@ -625,7 +626,6 @@ ; P9BE-NEXT: mulli r8, r7, 95 ; P9BE-NEXT: subf r3, r8, r3 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -822,20 +822,20 @@ ; P9LE-LABEL: dont_fold_urem_power_of_two: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: lis r6, 22765 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: ori r6, r6, 8969 ; P9LE-NEXT: rlwinm r3, r3, 0, 26, 31 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r3, r3, 0, 27, 31 ; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: rlwinm r3, r3, 0, 27, 31 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 ; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: clrldi r5, r4, 32 ; P9LE-NEXT: mulld r5, r5, r6 @@ -849,8 +849,8 @@ ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 ; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 ; P9LE-NEXT: vmrglh v2, v4, v2 @@ -860,7 +860,9 @@ ; P9BE-LABEL: dont_fold_urem_power_of_two: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: lis r5, 22765 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: ori r5, r5, 8969 ; P9BE-NEXT: rlwinm r3, r3, 0, 27, 31 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v3, r3 @@ -871,10 +873,8 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 ; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9BE-NEXT: clrldi r4, r3, 32 ; P9BE-NEXT: mulld r4, r4, r5 ; P9BE-NEXT: rldicl r4, r4, 32, 32 @@ -990,25 +990,25 @@ ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 ; P9LE-NEXT: mulld r4, r4, r6 ; P9LE-NEXT: rldicl r4, r4, 21, 43 ; P9LE-NEXT: mulli r4, r4, 5423 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: rlwinm r4, r3, 31, 17, 31 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxlxor v4, v4, v4 ; P9LE-NEXT: mulld r4, r4, r5 ; P9LE-NEXT: rldicl r4, r4, 24, 40 ; P9LE-NEXT: mulli r4, r4, 654 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: xxlxor v4, v4, v4 ; P9LE-NEXT: vmrglh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v3, v2 ; P9LE-NEXT: blr @@ -1016,10 +1016,10 @@ ; P9BE-LABEL: dont_fold_urem_one: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9BE-NEXT: lis r5, 24749 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r5, r5, 47143 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 ; P9BE-NEXT: clrldi r4, r3, 32 ; P9BE-NEXT: mulld r4, r4, r5 ; P9BE-NEXT: li r5, 0 @@ -1044,6 +1044,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: clrlwi r4, r3, 16 ; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 ; P9BE-NEXT: mulld r3, r3, r5 @@ -1054,7 +1055,6 @@ ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 @@ -1163,10 +1163,10 @@ ; P9LE-LABEL: dont_fold_urem_i64: ; P9LE: # %bb.0: ; P9LE-NEXT: lis r4, 25644 +; P9LE-NEXT: mfvsrld r3, v3 ; P9LE-NEXT: ori r4, r4, 34192 ; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: oris r4, r4, 45590 -; P9LE-NEXT: mfvsrld r3, v3 ; P9LE-NEXT: ori r4, r4, 17097 ; P9LE-NEXT: mulhdu r4, r3, r4 ; P9LE-NEXT: sub r5, r3, r4 @@ -1187,9 +1187,9 @@ ; P9LE-NEXT: sub r4, r4, r5 ; P9LE-NEXT: lis r5, 25653 ; P9LE-NEXT: ori r5, r5, 15432 -; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: mtvsrdd v3, r4, r3 ; P9LE-NEXT: mfvsrd r3, v2 +; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: rldicl r4, r3, 63, 1 ; P9LE-NEXT: oris r5, r5, 1603 ; P9LE-NEXT: ori r5, r5, 21445 @@ -1204,10 +1204,10 @@ ; P9BE-LABEL: dont_fold_urem_i64: ; P9BE: # %bb.0: ; P9BE-NEXT: lis r4, 25644 +; P9BE-NEXT: mfvsrd r3, v3 ; P9BE-NEXT: ori r4, r4, 34192 ; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: oris r4, r4, 45590 -; P9BE-NEXT: mfvsrd r3, v3 ; P9BE-NEXT: ori r4, r4, 17097 ; P9BE-NEXT: mulhdu r4, r3, r4 ; P9BE-NEXT: sub r5, r3, r4 @@ -1215,8 +1215,8 @@ ; P9BE-NEXT: add r4, r5, r4 ; P9BE-NEXT: lis r5, -16037 ; P9BE-NEXT: rldicl r4, r4, 60, 4 -; P9BE-NEXT: mulli r4, r4, 23 ; P9BE-NEXT: ori r5, r5, 28749 +; P9BE-NEXT: mulli r4, r4, 23 ; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: oris r5, r5, 52170 ; P9BE-NEXT: ori r5, r5, 12109 @@ -1228,9 +1228,9 @@ ; P9BE-NEXT: sub r4, r4, r5 ; P9BE-NEXT: lis r5, 25653 ; P9BE-NEXT: ori r5, r5, 15432 -; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: mtvsrdd v3, r3, r4 ; P9BE-NEXT: mfvsrld r3, v2 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: rldicl r4, r3, 63, 1 ; P9BE-NEXT: oris r5, r5, 1603 ; P9BE-NEXT: ori r5, r5, 21445 diff --git a/llvm/test/CodeGen/PowerPC/vavg.ll b/llvm/test/CodeGen/PowerPC/vavg.ll --- a/llvm/test/CodeGen/PowerPC/vavg.ll +++ b/llvm/test/CodeGen/PowerPC/vavg.ll @@ -138,8 +138,8 @@ ; CHECK-P9-LABEL: test_v8i16_sign_negative: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis 3, 2, .LCPI6_0@toc@ha -; CHECK-P9-NEXT: addi 3, 3, .LCPI6_0@toc@l ; CHECK-P9-NEXT: vadduhm 2, 2, 3 +; CHECK-P9-NEXT: addi 3, 3, .LCPI6_0@toc@l ; CHECK-P9-NEXT: lxvx 35, 0, 3 ; CHECK-P9-NEXT: vadduhm 2, 2, 3 ; CHECK-P9-NEXT: vspltish 3, 1 diff --git a/llvm/test/CodeGen/PowerPC/vec-bswap.ll b/llvm/test/CodeGen/PowerPC/vec-bswap.ll --- a/llvm/test/CodeGen/PowerPC/vec-bswap.ll +++ b/llvm/test/CodeGen/PowerPC/vec-bswap.ll @@ -3,7 +3,8 @@ define dso_local void @test(i32* %Arr, i32 signext %Len) { ; CHECK-LABEL: test: ; CHECK: lxvx [[REG:vs[0-9]+]], r{{[0-9]+}}, r{{[0-9]+}} -; CHECK-NEXT: xxbrw vs{{[0-9]+}}, [[REG]] +; CHECK-NOT: [[REG]] +; CHECK: xxbrw vs{{[0-9]+}}, [[REG]] entry: %cmp1 = icmp slt i32 0, %Len br i1 %cmp1, label %for.body.lr.ph, label %for.cond.cleanup diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll @@ -36,17 +36,17 @@ ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v2, vs1 +; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: xxswapd v3, vs0 ; CHECK-P9-NEXT: vmrglh v2, v3, v2 -; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: vextuwrx r3, r3, v2 ; CHECK-P9-NEXT: blr ; @@ -124,9 +124,9 @@ ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, v2 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, v2, v2, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 @@ -239,10 +239,10 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v2, vs2 @@ -256,18 +256,18 @@ ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: vmrglh v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: vmrglh v3, v3, v4 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 @@ -282,13 +282,13 @@ ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglh v4, v4, v5 ; CHECK-P9-NEXT: vmrglw v3, v4, v3 @@ -298,10 +298,10 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: xxswapd vs2, vs1 ; CHECK-BE-NEXT: sldi r3, r3, 48 @@ -326,14 +326,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghh v3, v3, v4 +; CHECK-BE-NEXT: vmrghw v2, v3, v2 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghh v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 48 -; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -483,50 +483,50 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r4) ; CHECK-P9-NEXT: lxv vs3, 16(r4) +; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: xscvspdpn f5, vs1 ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f8, vs3 ; CHECK-P9-NEXT: xxswapd vs4, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 +; CHECK-P9-NEXT: xxsldwi vs6, vs3, vs3, 3 +; CHECK-P9-NEXT: xxswapd vs7, vs3 +; CHECK-P9-NEXT: xxsldwi vs3, vs3, vs3, 1 +; CHECK-P9-NEXT: xxsldwi vs9, vs0, vs0, 3 +; CHECK-P9-NEXT: xxswapd vs10, vs0 ; CHECK-P9-NEXT: xscvdpsxws f5, f5 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f8, f8 -; CHECK-P9-NEXT: xxsldwi vs6, vs3, vs3, 3 -; CHECK-P9-NEXT: xxswapd vs7, vs3 +; CHECK-P9-NEXT: xscvspdpn f4, vs4 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvspdpn f6, vs6 -; CHECK-P9-NEXT: xxsldwi vs3, vs3, vs3, 1 ; CHECK-P9-NEXT: xscvspdpn f7, vs7 ; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xscvspdpn f9, vs9 +; CHECK-P9-NEXT: xscvspdpn f10, vs10 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f4, f4 -; CHECK-P9-NEXT: xscvdpsxws f6, f6 -; CHECK-P9-NEXT: mfvsrwz r5, f5 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: xscvdpsxws f6, f6 ; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: xscvdpsxws f9, f9 +; CHECK-P9-NEXT: xscvdpsxws f10, f10 +; CHECK-P9-NEXT: mfvsrwz r5, f5 ; CHECK-P9-NEXT: mtvsrd f5, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f8 ; CHECK-P9-NEXT: mtvsrd f8, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f2 -; CHECK-P9-NEXT: lxv vs0, 32(r4) -; CHECK-P9-NEXT: xxsldwi vs9, vs0, vs0, 3 -; CHECK-P9-NEXT: xxswapd vs10, vs0 -; CHECK-P9-NEXT: xscvspdpn f9, vs9 -; CHECK-P9-NEXT: xscvspdpn f10, vs10 -; CHECK-P9-NEXT: xscvdpsxws f9, f9 -; CHECK-P9-NEXT: xscvdpsxws f10, f10 ; CHECK-P9-NEXT: mtvsrd f2, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f4 ; CHECK-P9-NEXT: mtvsrd f4, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f1 -; CHECK-P9-NEXT: mtvsrd f1, r5 -; CHECK-P9-NEXT: mfvsrwz r5, f6 ; CHECK-P9-NEXT: xxswapd v2, vs2 -; CHECK-P9-NEXT: xxswapd v3, vs4 ; CHECK-P9-NEXT: xscvspdpn f2, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 +; CHECK-P9-NEXT: mtvsrd f1, r5 +; CHECK-P9-NEXT: mfvsrwz r5, f6 +; CHECK-P9-NEXT: xxswapd v3, vs4 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: mtvsrd f6, r5 @@ -537,27 +537,37 @@ ; CHECK-P9-NEXT: xxswapd v3, vs5 ; CHECK-P9-NEXT: mtvsrd f7, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: xxswapd v4, vs6 ; CHECK-P9-NEXT: xxswapd v5, vs7 ; CHECK-P9-NEXT: mtvsrd f3, r5 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: mfvsrwz r5, f9 ; CHECK-P9-NEXT: xxswapd v0, vs3 +; CHECK-P9-NEXT: mfvsrwz r4, f2 +; CHECK-P9-NEXT: mtvsrd f9, r5 +; CHECK-P9-NEXT: mfvsrwz r5, f10 ; CHECK-P9-NEXT: vmrglh v4, v5, v4 ; CHECK-P9-NEXT: xxswapd v5, vs8 -; CHECK-P9-NEXT: vmrglh v5, v5, v0 -; CHECK-P9-NEXT: mfvsrwz r4, f2 +; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mtvsrd f2, r4 +; CHECK-P9-NEXT: mtvsrd f10, r5 ; CHECK-P9-NEXT: mfvsrwz r4, f0 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 +; CHECK-P9-NEXT: vmrglh v5, v5, v0 +; CHECK-P9-NEXT: xxswapd v0, vs9 +; CHECK-P9-NEXT: xxswapd v1, vs10 ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: vmrglw v3, v5, v4 ; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: vmrglh v0, v1, v0 ; CHECK-P9-NEXT: xxmrgld vs2, v3, v2 ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 +; CHECK-P9-NEXT: vmrglh v2, v4, v2 +; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglw v2, v2, v0 ; CHECK-P9-NEXT: mfvsrwz r4, f0 ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: xxswapd v3, vs0 @@ -566,26 +576,16 @@ ; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r4, f0 ; CHECK-P9-NEXT: mtvsrd f0, r4 -; CHECK-P9-NEXT: vmrglh v2, v4, v2 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs1 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r4, f0 ; CHECK-P9-NEXT: mtvsrd f0, r4 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r5, f9 -; CHECK-P9-NEXT: mtvsrd f9, r5 -; CHECK-P9-NEXT: mfvsrwz r5, f10 -; CHECK-P9-NEXT: mtvsrd f10, r5 -; CHECK-P9-NEXT: xxswapd v0, vs9 -; CHECK-P9-NEXT: xxswapd v1, vs10 -; CHECK-P9-NEXT: vmrglh v0, v1, v0 -; CHECK-P9-NEXT: vmrglw v2, v2, v0 -; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: mfvsrwz r4, f0 ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: xxswapd v5, vs0 @@ -598,25 +598,26 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r4) +; CHECK-BE-NEXT: lxv vs0, 0(r4) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xxswapd vs3, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: mfvsrwz r5, f2 ; CHECK-BE-NEXT: xscvspdpn f4, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: mfvsrwz r5, f2 +; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3 ; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v2, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f3 ; CHECK-BE-NEXT: xscvdpsxws f3, f4 -; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mfvsrwz r5, f3 @@ -624,7 +625,6 @@ ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r5 @@ -651,24 +651,24 @@ ; CHECK-BE-NEXT: lxv vs0, 32(r4) ; CHECK-BE-NEXT: xscvspdpn f5, vs1 ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xxswapd vs3, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvdpsxws f5, f5 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v0, r5 -; CHECK-BE-NEXT: vmrghh v5, v5, v0 ; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: vmrghh v5, v5, v0 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: vmrghw v3, v5, v4 ; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghw v3, v5, v4 ; CHECK-BE-NEXT: mfvsrwz r4, f5 ; CHECK-BE-NEXT: xxmrghd vs4, v3, v2 ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: mfvsrwz r4, f2 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: stxv vs4, 0(r3) ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v3, r4 @@ -678,18 +678,18 @@ ; CHECK-BE-NEXT: mfvsrwz r4, f1 ; CHECK-BE-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-BE-NEXT: sldi r4, r4, 48 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: vmrghh v3, v4, v3 -; CHECK-BE-NEXT: mtvsrd v4, r4 -; CHECK-BE-NEXT: mfvsrwz r4, f1 -; CHECK-BE-NEXT: xxswapd vs1, vs0 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: mtvsrd v4, r4 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: vmrghh v2, v2, v4 -; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: vmrghw v2, v2, v3 +; CHECK-BE-NEXT: mfvsrwz r4, f1 +; CHECK-BE-NEXT: xxswapd vs1, vs0 +; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r4 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mfvsrwz r4, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -744,17 +744,17 @@ ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v2, vs1 +; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: xxswapd v3, vs0 ; CHECK-P9-NEXT: vmrglh v2, v3, v2 -; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: vextuwrx r3, r3, v2 ; CHECK-P9-NEXT: blr ; @@ -832,9 +832,9 @@ ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, v2 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, v2, v2, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 @@ -947,10 +947,10 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v2, vs2 @@ -964,18 +964,18 @@ ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: vmrglh v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: vmrglh v3, v3, v4 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 @@ -990,13 +990,13 @@ ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglh v4, v4, v5 ; CHECK-P9-NEXT: vmrglw v3, v4, v3 @@ -1006,10 +1006,10 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: xxswapd vs2, vs1 ; CHECK-BE-NEXT: sldi r3, r3, 48 @@ -1034,14 +1034,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghh v3, v3, v4 +; CHECK-BE-NEXT: vmrghw v2, v3, v2 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghh v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 48 -; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -1191,50 +1191,50 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r4) ; CHECK-P9-NEXT: lxv vs3, 16(r4) +; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: xscvspdpn f5, vs1 ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f8, vs3 ; CHECK-P9-NEXT: xxswapd vs4, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 +; CHECK-P9-NEXT: xxsldwi vs6, vs3, vs3, 3 +; CHECK-P9-NEXT: xxswapd vs7, vs3 +; CHECK-P9-NEXT: xxsldwi vs3, vs3, vs3, 1 +; CHECK-P9-NEXT: xxsldwi vs9, vs0, vs0, 3 +; CHECK-P9-NEXT: xxswapd vs10, vs0 ; CHECK-P9-NEXT: xscvdpsxws f5, f5 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f8, f8 -; CHECK-P9-NEXT: xxsldwi vs6, vs3, vs3, 3 -; CHECK-P9-NEXT: xxswapd vs7, vs3 +; CHECK-P9-NEXT: xscvspdpn f4, vs4 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvspdpn f6, vs6 -; CHECK-P9-NEXT: xxsldwi vs3, vs3, vs3, 1 ; CHECK-P9-NEXT: xscvspdpn f7, vs7 ; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xscvspdpn f9, vs9 +; CHECK-P9-NEXT: xscvspdpn f10, vs10 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f4, f4 -; CHECK-P9-NEXT: xscvdpsxws f6, f6 -; CHECK-P9-NEXT: mfvsrwz r5, f5 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: xscvdpsxws f6, f6 ; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: xscvdpsxws f9, f9 +; CHECK-P9-NEXT: xscvdpsxws f10, f10 +; CHECK-P9-NEXT: mfvsrwz r5, f5 ; CHECK-P9-NEXT: mtvsrd f5, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f8 ; CHECK-P9-NEXT: mtvsrd f8, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f2 -; CHECK-P9-NEXT: lxv vs0, 32(r4) -; CHECK-P9-NEXT: xxsldwi vs9, vs0, vs0, 3 -; CHECK-P9-NEXT: xxswapd vs10, vs0 -; CHECK-P9-NEXT: xscvspdpn f9, vs9 -; CHECK-P9-NEXT: xscvspdpn f10, vs10 -; CHECK-P9-NEXT: xscvdpsxws f9, f9 -; CHECK-P9-NEXT: xscvdpsxws f10, f10 ; CHECK-P9-NEXT: mtvsrd f2, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f4 ; CHECK-P9-NEXT: mtvsrd f4, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f1 -; CHECK-P9-NEXT: mtvsrd f1, r5 -; CHECK-P9-NEXT: mfvsrwz r5, f6 ; CHECK-P9-NEXT: xxswapd v2, vs2 -; CHECK-P9-NEXT: xxswapd v3, vs4 ; CHECK-P9-NEXT: xscvspdpn f2, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 +; CHECK-P9-NEXT: mtvsrd f1, r5 +; CHECK-P9-NEXT: mfvsrwz r5, f6 +; CHECK-P9-NEXT: xxswapd v3, vs4 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: mtvsrd f6, r5 @@ -1245,27 +1245,37 @@ ; CHECK-P9-NEXT: xxswapd v3, vs5 ; CHECK-P9-NEXT: mtvsrd f7, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: xxswapd v4, vs6 ; CHECK-P9-NEXT: xxswapd v5, vs7 ; CHECK-P9-NEXT: mtvsrd f3, r5 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: mfvsrwz r5, f9 ; CHECK-P9-NEXT: xxswapd v0, vs3 +; CHECK-P9-NEXT: mfvsrwz r4, f2 +; CHECK-P9-NEXT: mtvsrd f9, r5 +; CHECK-P9-NEXT: mfvsrwz r5, f10 ; CHECK-P9-NEXT: vmrglh v4, v5, v4 ; CHECK-P9-NEXT: xxswapd v5, vs8 -; CHECK-P9-NEXT: vmrglh v5, v5, v0 -; CHECK-P9-NEXT: mfvsrwz r4, f2 +; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mtvsrd f2, r4 +; CHECK-P9-NEXT: mtvsrd f10, r5 ; CHECK-P9-NEXT: mfvsrwz r4, f0 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 +; CHECK-P9-NEXT: vmrglh v5, v5, v0 +; CHECK-P9-NEXT: xxswapd v0, vs9 +; CHECK-P9-NEXT: xxswapd v1, vs10 ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: vmrglw v3, v5, v4 ; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: vmrglh v0, v1, v0 ; CHECK-P9-NEXT: xxmrgld vs2, v3, v2 ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 +; CHECK-P9-NEXT: vmrglh v2, v4, v2 +; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglw v2, v2, v0 ; CHECK-P9-NEXT: mfvsrwz r4, f0 ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: xxswapd v3, vs0 @@ -1274,26 +1284,16 @@ ; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r4, f0 ; CHECK-P9-NEXT: mtvsrd f0, r4 -; CHECK-P9-NEXT: vmrglh v2, v4, v2 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs1 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r4, f0 ; CHECK-P9-NEXT: mtvsrd f0, r4 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r5, f9 -; CHECK-P9-NEXT: mtvsrd f9, r5 -; CHECK-P9-NEXT: mfvsrwz r5, f10 -; CHECK-P9-NEXT: mtvsrd f10, r5 -; CHECK-P9-NEXT: xxswapd v0, vs9 -; CHECK-P9-NEXT: xxswapd v1, vs10 -; CHECK-P9-NEXT: vmrglh v0, v1, v0 -; CHECK-P9-NEXT: vmrglw v2, v2, v0 -; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: mfvsrwz r4, f0 ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: xxswapd v5, vs0 @@ -1306,25 +1306,26 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r4) +; CHECK-BE-NEXT: lxv vs0, 0(r4) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xxswapd vs3, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: mfvsrwz r5, f2 ; CHECK-BE-NEXT: xscvspdpn f4, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: mfvsrwz r5, f2 +; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3 ; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v2, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f3 ; CHECK-BE-NEXT: xscvdpsxws f3, f4 -; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mfvsrwz r5, f3 @@ -1332,7 +1333,6 @@ ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r5 @@ -1359,24 +1359,24 @@ ; CHECK-BE-NEXT: lxv vs0, 32(r4) ; CHECK-BE-NEXT: xscvspdpn f5, vs1 ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xxswapd vs3, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvdpsxws f5, f5 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v0, r5 -; CHECK-BE-NEXT: vmrghh v5, v5, v0 ; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: vmrghh v5, v5, v0 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: vmrghw v3, v5, v4 ; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghw v3, v5, v4 ; CHECK-BE-NEXT: mfvsrwz r4, f5 ; CHECK-BE-NEXT: xxmrghd vs4, v3, v2 ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: mfvsrwz r4, f2 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: stxv vs4, 0(r3) ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v3, r4 @@ -1386,18 +1386,18 @@ ; CHECK-BE-NEXT: mfvsrwz r4, f1 ; CHECK-BE-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-BE-NEXT: sldi r4, r4, 48 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: vmrghh v3, v4, v3 -; CHECK-BE-NEXT: mtvsrd v4, r4 -; CHECK-BE-NEXT: mfvsrwz r4, f1 -; CHECK-BE-NEXT: xxswapd vs1, vs0 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: mtvsrd v4, r4 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: vmrghh v2, v2, v4 -; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: vmrghw v2, v2, v3 +; CHECK-BE-NEXT: mfvsrwz r4, f1 +; CHECK-BE-NEXT: xxswapd vs1, vs0 +; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r4 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mfvsrwz r4, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll @@ -91,10 +91,10 @@ ; CHECK-BE-NEXT: xscvspdpn f0, v2 ; CHECK-BE-NEXT: xxswapd vs2, v2 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, v2, v2, 3 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 ; CHECK-BE-NEXT: xxmrghd vs1, vs2, vs1 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 @@ -152,10 +152,10 @@ ; CHECK-P9-NEXT: lxv vs0, 0(r4) ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xxswapd vs2, vs0 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvspdpn f3, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxmrghd vs1, vs2, vs1 ; CHECK-P9-NEXT: lxv vs2, 16(r4) @@ -167,6 +167,7 @@ ; CHECK-P9-NEXT: xscvspdpn f3, vs3 ; CHECK-P9-NEXT: xscvspdpn f4, vs4 ; CHECK-P9-NEXT: stxv vs0, 16(r3) +; CHECK-P9-NEXT: stxv vs1, 0(r3) ; CHECK-P9-NEXT: xxmrghd vs3, vs4, vs3 ; CHECK-P9-NEXT: xscvspdpn f4, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 @@ -176,39 +177,38 @@ ; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-P9-NEXT: stxv vs3, 32(r3) ; CHECK-P9-NEXT: stxv vs2, 48(r3) -; CHECK-P9-NEXT: stxv vs1, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 0(r4) +; CHECK-BE-NEXT: lxv vs0, 16(r4) ; CHECK-BE-NEXT: xxsldwi vs3, vs1, vs1, 1 ; CHECK-BE-NEXT: xscvspdpn f2, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: lxv vs0, 16(r4) ; CHECK-BE-NEXT: xxsldwi vs4, vs0, vs0, 1 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 ; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 ; CHECK-BE-NEXT: xxsldwi vs3, vs1, vs1, 3 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs3 ; CHECK-BE-NEXT: xscvspdpn f3, vs0 ; CHECK-BE-NEXT: xxmrghd vs3, vs3, vs4 ; CHECK-BE-NEXT: xxsldwi vs4, vs0, vs0, 3 ; CHECK-BE-NEXT: xxswapd vs0, vs0 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs4 -; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 +; CHECK-BE-NEXT: stxv vs2, 0(r3) +; CHECK-BE-NEXT: xscvspdpn f4, vs4 +; CHECK-BE-NEXT: xscvspdpn f0, vs0 ; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs4 ; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 ; CHECK-BE-NEXT: stxv vs3, 32(r3) ; CHECK-BE-NEXT: stxv vs0, 48(r3) -; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: blr entry: %a = load <8 x float>, <8 x float>* %0, align 32 @@ -296,121 +296,121 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs4, 16(r4) +; CHECK-P9-NEXT: lxv vs0, 0(r4) +; CHECK-P9-NEXT: lxv vs3, 32(r4) ; CHECK-P9-NEXT: xxsldwi vs5, vs4, vs4, 3 ; CHECK-P9-NEXT: xxswapd vs6, vs4 -; CHECK-P9-NEXT: lxv vs0, 0(r4) ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xxswapd vs2, vs0 +; CHECK-P9-NEXT: xxswapd vs7, vs3 ; CHECK-P9-NEXT: xscvspdpn f5, vs5 ; CHECK-P9-NEXT: xscvspdpn f6, vs6 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvspdpn f7, vs7 ; CHECK-P9-NEXT: xxmrghd vs5, vs6, vs5 ; CHECK-P9-NEXT: xscvspdpn f6, vs4 ; CHECK-P9-NEXT: xxsldwi vs4, vs4, vs4, 1 -; CHECK-P9-NEXT: lxv vs3, 32(r4) -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxswapd vs7, vs3 -; CHECK-P9-NEXT: xscvspdpn f7, vs7 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xxmrghd vs1, vs2, vs1 ; CHECK-P9-NEXT: xscvspdpn f2, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 +; CHECK-P9-NEXT: xscvspdpn f4, vs4 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xxmrghd vs0, vs2, vs0 +; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 ; CHECK-P9-NEXT: xxmrghd vs4, vs6, vs4 ; CHECK-P9-NEXT: xxsldwi vs6, vs3, vs3, 3 -; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 +; CHECK-P9-NEXT: xxmrghd vs0, vs2, vs0 +; CHECK-P9-NEXT: lxv vs2, 48(r4) ; CHECK-P9-NEXT: xvcvdpuxds vs5, vs5 ; CHECK-P9-NEXT: xscvspdpn f6, vs6 +; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-P9-NEXT: xvcvdpuxds vs4, vs4 +; CHECK-P9-NEXT: stxv vs5, 32(r3) +; CHECK-P9-NEXT: stxv vs1, 0(r3) +; CHECK-P9-NEXT: xxswapd vs8, vs2 ; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: xscvspdpn f7, vs3 ; CHECK-P9-NEXT: xxsldwi vs3, vs3, vs3, 1 -; CHECK-P9-NEXT: lxv vs2, 48(r4) -; CHECK-P9-NEXT: xxswapd vs8, vs2 ; CHECK-P9-NEXT: xscvspdpn f8, vs8 -; CHECK-P9-NEXT: xvcvdpuxds vs4, vs4 ; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xvcvdpuxds vs6, vs6 +; CHECK-P9-NEXT: stxv vs4, 48(r3) +; CHECK-P9-NEXT: stxv vs0, 16(r3) ; CHECK-P9-NEXT: xxmrghd vs3, vs7, vs3 ; CHECK-P9-NEXT: xxsldwi vs7, vs2, vs2, 3 -; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-P9-NEXT: xvcvdpuxds vs6, vs6 -; CHECK-P9-NEXT: stxv vs6, 64(r3) ; CHECK-P9-NEXT: xscvspdpn f7, vs7 +; CHECK-P9-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-P9-NEXT: stxv vs6, 64(r3) ; CHECK-P9-NEXT: xxmrghd vs7, vs8, vs7 ; CHECK-P9-NEXT: xscvspdpn f8, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxmrghd vs2, vs8, vs2 -; CHECK-P9-NEXT: xvcvdpuxds vs3, vs3 ; CHECK-P9-NEXT: xvcvdpuxds vs7, vs7 ; CHECK-P9-NEXT: stxv vs3, 80(r3) +; CHECK-P9-NEXT: xxmrghd vs2, vs8, vs2 ; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-P9-NEXT: stxv vs7, 96(r3) ; CHECK-P9-NEXT: stxv vs2, 112(r3) -; CHECK-P9-NEXT: stxv vs4, 48(r3) -; CHECK-P9-NEXT: stxv vs5, 32(r3) -; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: stxv vs1, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs0, 0(r4) ; CHECK-BE-NEXT: lxv vs4, 16(r4) -; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; CHECK-BE-NEXT: lxv vs3, 32(r4) ; CHECK-BE-NEXT: xscvspdpn f1, vs0 +; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 1 ; CHECK-BE-NEXT: xxsldwi vs5, vs0, vs0, 3 ; CHECK-BE-NEXT: xxswapd vs0, vs0 +; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 1 +; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 1 ; CHECK-BE-NEXT: xscvspdpn f5, vs5 ; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 1 ; CHECK-BE-NEXT: xscvspdpn f6, vs6 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f7, vs7 ; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs5 ; CHECK-BE-NEXT: xscvspdpn f5, vs4 -; CHECK-BE-NEXT: lxv vs3, 32(r4) -; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 1 -; CHECK-BE-NEXT: xscvspdpn f7, vs7 +; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 +; CHECK-BE-NEXT: lxv vs2, 48(r4) ; CHECK-BE-NEXT: xxmrghd vs5, vs5, vs6 ; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 3 ; CHECK-BE-NEXT: xxswapd vs4, vs4 -; CHECK-BE-NEXT: xscvspdpn f6, vs6 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 -; CHECK-BE-NEXT: lxv vs2, 48(r4) -; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 1 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 +; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 1 ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-BE-NEXT: xscvspdpn f6, vs6 +; CHECK-BE-NEXT: xscvspdpn f4, vs4 ; CHECK-BE-NEXT: xvcvdpuxds vs5, vs5 ; CHECK-BE-NEXT: xscvspdpn f8, vs8 ; CHECK-BE-NEXT: xxmrghd vs4, vs4, vs6 ; CHECK-BE-NEXT: xscvspdpn f6, vs3 ; CHECK-BE-NEXT: stxv vs0, 16(r3) +; CHECK-BE-NEXT: stxv vs1, 0(r3) ; CHECK-BE-NEXT: xxmrghd vs6, vs6, vs7 ; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 3 ; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xvcvdpuxds vs4, vs4 +; CHECK-BE-NEXT: stxv vs5, 32(r3) ; CHECK-BE-NEXT: xscvspdpn f7, vs7 ; CHECK-BE-NEXT: xscvspdpn f3, vs3 +; CHECK-BE-NEXT: xvcvdpuxds vs6, vs6 ; CHECK-BE-NEXT: xxmrghd vs3, vs3, vs7 ; CHECK-BE-NEXT: xscvspdpn f7, vs2 +; CHECK-BE-NEXT: stxv vs4, 48(r3) ; CHECK-BE-NEXT: xxmrghd vs7, vs7, vs8 ; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 3 ; CHECK-BE-NEXT: xxswapd vs2, vs2 +; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-BE-NEXT: stxv vs6, 64(r3) ; CHECK-BE-NEXT: xscvspdpn f8, vs8 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs8 -; CHECK-BE-NEXT: stxv vs5, 32(r3) -; CHECK-BE-NEXT: xvcvdpuxds vs4, vs4 -; CHECK-BE-NEXT: xvcvdpuxds vs6, vs6 -; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 ; CHECK-BE-NEXT: xvcvdpuxds vs7, vs7 +; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs8 ; CHECK-BE-NEXT: stxv vs3, 80(r3) -; CHECK-BE-NEXT: stxv vs7, 96(r3) ; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 +; CHECK-BE-NEXT: stxv vs7, 96(r3) ; CHECK-BE-NEXT: stxv vs2, 112(r3) -; CHECK-BE-NEXT: stxv vs6, 64(r3) -; CHECK-BE-NEXT: stxv vs4, 48(r3) -; CHECK-BE-NEXT: stxv vs1, 0(r3) ; CHECK-BE-NEXT: blr entry: %a = load <16 x float>, <16 x float>* %0, align 64 @@ -501,10 +501,10 @@ ; CHECK-BE-NEXT: xscvspdpn f0, v2 ; CHECK-BE-NEXT: xxswapd vs2, v2 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, v2, v2, 3 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 ; CHECK-BE-NEXT: xxmrghd vs1, vs2, vs1 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 @@ -562,10 +562,10 @@ ; CHECK-P9-NEXT: lxv vs0, 0(r4) ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xxswapd vs2, vs0 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvspdpn f3, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxmrghd vs1, vs2, vs1 ; CHECK-P9-NEXT: lxv vs2, 16(r4) @@ -577,6 +577,7 @@ ; CHECK-P9-NEXT: xscvspdpn f3, vs3 ; CHECK-P9-NEXT: xscvspdpn f4, vs4 ; CHECK-P9-NEXT: stxv vs0, 16(r3) +; CHECK-P9-NEXT: stxv vs1, 0(r3) ; CHECK-P9-NEXT: xxmrghd vs3, vs4, vs3 ; CHECK-P9-NEXT: xscvspdpn f4, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 @@ -586,39 +587,38 @@ ; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-P9-NEXT: stxv vs3, 32(r3) ; CHECK-P9-NEXT: stxv vs2, 48(r3) -; CHECK-P9-NEXT: stxv vs1, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 0(r4) +; CHECK-BE-NEXT: lxv vs0, 16(r4) ; CHECK-BE-NEXT: xxsldwi vs3, vs1, vs1, 1 ; CHECK-BE-NEXT: xscvspdpn f2, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: lxv vs0, 16(r4) ; CHECK-BE-NEXT: xxsldwi vs4, vs0, vs0, 1 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 ; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 ; CHECK-BE-NEXT: xxsldwi vs3, vs1, vs1, 3 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs3 ; CHECK-BE-NEXT: xscvspdpn f3, vs0 ; CHECK-BE-NEXT: xxmrghd vs3, vs3, vs4 ; CHECK-BE-NEXT: xxsldwi vs4, vs0, vs0, 3 ; CHECK-BE-NEXT: xxswapd vs0, vs0 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs4 -; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 +; CHECK-BE-NEXT: stxv vs2, 0(r3) +; CHECK-BE-NEXT: xscvspdpn f4, vs4 +; CHECK-BE-NEXT: xscvspdpn f0, vs0 ; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs4 ; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 ; CHECK-BE-NEXT: stxv vs3, 32(r3) ; CHECK-BE-NEXT: stxv vs0, 48(r3) -; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: blr entry: %a = load <8 x float>, <8 x float>* %0, align 32 @@ -706,121 +706,121 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs4, 16(r4) +; CHECK-P9-NEXT: lxv vs0, 0(r4) +; CHECK-P9-NEXT: lxv vs3, 32(r4) ; CHECK-P9-NEXT: xxsldwi vs5, vs4, vs4, 3 ; CHECK-P9-NEXT: xxswapd vs6, vs4 -; CHECK-P9-NEXT: lxv vs0, 0(r4) ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xxswapd vs2, vs0 +; CHECK-P9-NEXT: xxswapd vs7, vs3 ; CHECK-P9-NEXT: xscvspdpn f5, vs5 ; CHECK-P9-NEXT: xscvspdpn f6, vs6 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvspdpn f7, vs7 ; CHECK-P9-NEXT: xxmrghd vs5, vs6, vs5 ; CHECK-P9-NEXT: xscvspdpn f6, vs4 ; CHECK-P9-NEXT: xxsldwi vs4, vs4, vs4, 1 -; CHECK-P9-NEXT: lxv vs3, 32(r4) -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxswapd vs7, vs3 -; CHECK-P9-NEXT: xscvspdpn f7, vs7 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xxmrghd vs1, vs2, vs1 ; CHECK-P9-NEXT: xscvspdpn f2, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 +; CHECK-P9-NEXT: xscvspdpn f4, vs4 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xxmrghd vs0, vs2, vs0 +; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 ; CHECK-P9-NEXT: xxmrghd vs4, vs6, vs4 ; CHECK-P9-NEXT: xxsldwi vs6, vs3, vs3, 3 -; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 +; CHECK-P9-NEXT: xxmrghd vs0, vs2, vs0 +; CHECK-P9-NEXT: lxv vs2, 48(r4) ; CHECK-P9-NEXT: xvcvdpuxds vs5, vs5 ; CHECK-P9-NEXT: xscvspdpn f6, vs6 +; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-P9-NEXT: xvcvdpuxds vs4, vs4 +; CHECK-P9-NEXT: stxv vs5, 32(r3) +; CHECK-P9-NEXT: stxv vs1, 0(r3) +; CHECK-P9-NEXT: xxswapd vs8, vs2 ; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: xscvspdpn f7, vs3 ; CHECK-P9-NEXT: xxsldwi vs3, vs3, vs3, 1 -; CHECK-P9-NEXT: lxv vs2, 48(r4) -; CHECK-P9-NEXT: xxswapd vs8, vs2 ; CHECK-P9-NEXT: xscvspdpn f8, vs8 -; CHECK-P9-NEXT: xvcvdpuxds vs4, vs4 ; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xvcvdpuxds vs6, vs6 +; CHECK-P9-NEXT: stxv vs4, 48(r3) +; CHECK-P9-NEXT: stxv vs0, 16(r3) ; CHECK-P9-NEXT: xxmrghd vs3, vs7, vs3 ; CHECK-P9-NEXT: xxsldwi vs7, vs2, vs2, 3 -; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-P9-NEXT: xvcvdpuxds vs6, vs6 -; CHECK-P9-NEXT: stxv vs6, 64(r3) ; CHECK-P9-NEXT: xscvspdpn f7, vs7 +; CHECK-P9-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-P9-NEXT: stxv vs6, 64(r3) ; CHECK-P9-NEXT: xxmrghd vs7, vs8, vs7 ; CHECK-P9-NEXT: xscvspdpn f8, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxmrghd vs2, vs8, vs2 -; CHECK-P9-NEXT: xvcvdpuxds vs3, vs3 ; CHECK-P9-NEXT: xvcvdpuxds vs7, vs7 ; CHECK-P9-NEXT: stxv vs3, 80(r3) +; CHECK-P9-NEXT: xxmrghd vs2, vs8, vs2 ; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-P9-NEXT: stxv vs7, 96(r3) ; CHECK-P9-NEXT: stxv vs2, 112(r3) -; CHECK-P9-NEXT: stxv vs4, 48(r3) -; CHECK-P9-NEXT: stxv vs5, 32(r3) -; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: stxv vs1, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs0, 0(r4) ; CHECK-BE-NEXT: lxv vs4, 16(r4) -; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; CHECK-BE-NEXT: lxv vs3, 32(r4) ; CHECK-BE-NEXT: xscvspdpn f1, vs0 +; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 1 ; CHECK-BE-NEXT: xxsldwi vs5, vs0, vs0, 3 ; CHECK-BE-NEXT: xxswapd vs0, vs0 +; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 1 +; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 1 ; CHECK-BE-NEXT: xscvspdpn f5, vs5 ; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 1 ; CHECK-BE-NEXT: xscvspdpn f6, vs6 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f7, vs7 ; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs5 ; CHECK-BE-NEXT: xscvspdpn f5, vs4 -; CHECK-BE-NEXT: lxv vs3, 32(r4) -; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 1 -; CHECK-BE-NEXT: xscvspdpn f7, vs7 +; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 +; CHECK-BE-NEXT: lxv vs2, 48(r4) ; CHECK-BE-NEXT: xxmrghd vs5, vs5, vs6 ; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 3 ; CHECK-BE-NEXT: xxswapd vs4, vs4 -; CHECK-BE-NEXT: xscvspdpn f6, vs6 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 -; CHECK-BE-NEXT: lxv vs2, 48(r4) -; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 1 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 +; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 1 ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-BE-NEXT: xscvspdpn f6, vs6 +; CHECK-BE-NEXT: xscvspdpn f4, vs4 ; CHECK-BE-NEXT: xvcvdpuxds vs5, vs5 ; CHECK-BE-NEXT: xscvspdpn f8, vs8 ; CHECK-BE-NEXT: xxmrghd vs4, vs4, vs6 ; CHECK-BE-NEXT: xscvspdpn f6, vs3 ; CHECK-BE-NEXT: stxv vs0, 16(r3) +; CHECK-BE-NEXT: stxv vs1, 0(r3) ; CHECK-BE-NEXT: xxmrghd vs6, vs6, vs7 ; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 3 ; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xvcvdpuxds vs4, vs4 +; CHECK-BE-NEXT: stxv vs5, 32(r3) ; CHECK-BE-NEXT: xscvspdpn f7, vs7 ; CHECK-BE-NEXT: xscvspdpn f3, vs3 +; CHECK-BE-NEXT: xvcvdpuxds vs6, vs6 ; CHECK-BE-NEXT: xxmrghd vs3, vs3, vs7 ; CHECK-BE-NEXT: xscvspdpn f7, vs2 +; CHECK-BE-NEXT: stxv vs4, 48(r3) ; CHECK-BE-NEXT: xxmrghd vs7, vs7, vs8 ; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 3 ; CHECK-BE-NEXT: xxswapd vs2, vs2 +; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-BE-NEXT: stxv vs6, 64(r3) ; CHECK-BE-NEXT: xscvspdpn f8, vs8 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs8 -; CHECK-BE-NEXT: stxv vs5, 32(r3) -; CHECK-BE-NEXT: xvcvdpuxds vs4, vs4 -; CHECK-BE-NEXT: xvcvdpuxds vs6, vs6 -; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 ; CHECK-BE-NEXT: xvcvdpuxds vs7, vs7 +; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs8 ; CHECK-BE-NEXT: stxv vs3, 80(r3) -; CHECK-BE-NEXT: stxv vs7, 96(r3) ; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 +; CHECK-BE-NEXT: stxv vs7, 96(r3) ; CHECK-BE-NEXT: stxv vs2, 112(r3) -; CHECK-BE-NEXT: stxv vs6, 64(r3) -; CHECK-BE-NEXT: stxv vs4, 48(r3) -; CHECK-BE-NEXT: stxv vs1, 0(r3) ; CHECK-BE-NEXT: blr entry: %a = load <16 x float>, <16 x float>* %0, align 64 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll @@ -39,18 +39,18 @@ ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v2, vs1 +; CHECK-P9-NEXT: addi r3, r1, -2 ; CHECK-P9-NEXT: xxswapd v3, vs0 ; CHECK-P9-NEXT: vmrglb v2, v3, v2 ; CHECK-P9-NEXT: vsldoi v2, v2, v2, 8 -; CHECK-P9-NEXT: addi r3, r1, -2 ; CHECK-P9-NEXT: stxsihx v2, 0, r3 ; CHECK-P9-NEXT: lhz r3, -2(r1) ; CHECK-P9-NEXT: blr @@ -131,9 +131,9 @@ ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, v2 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, v2, v2, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 @@ -250,10 +250,10 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v2, vs2 @@ -267,18 +267,18 @@ ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v2, v3, v2 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: vmrglb v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 @@ -293,13 +293,13 @@ ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 @@ -310,10 +310,10 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: xxswapd vs2, vs1 ; CHECK-BE-NEXT: sldi r3, r3, 56 @@ -338,14 +338,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghb v3, v3, v4 +; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghb v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -495,12 +495,12 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs2, 0(r3) -; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 -; CHECK-P9-NEXT: xscvspdpn f3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) ; CHECK-P9-NEXT: lxv vs4, 16(r3) +; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 +; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: xxswapd v2, vs3 @@ -514,18 +514,18 @@ ; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: vmrglb v2, v3, v2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: xxswapd v3, vs3 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: vmrglb v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs3 ; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 @@ -538,9 +538,9 @@ ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xscvspdpn f2, vs4 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 1 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 @@ -550,8 +550,8 @@ ; CHECK-P9-NEXT: xxswapd v5, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f2 @@ -567,18 +567,18 @@ ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v5, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 -; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 @@ -593,13 +593,13 @@ ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglb v4, v5, v4 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v4, v5, v4 ; CHECK-P9-NEXT: xxswapd v5, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v0, vs0 ; CHECK-P9-NEXT: vmrglb v5, v5, v0 ; CHECK-P9-NEXT: vmrglh v4, v5, v4 @@ -610,12 +610,12 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) ; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 +; CHECK-BE-NEXT: xscvspdpn f4, vs4 +; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: mfvsrwz r3, f4 ; CHECK-BE-NEXT: xxswapd vs4, vs3 ; CHECK-BE-NEXT: sldi r3, r3, 56 @@ -640,14 +640,14 @@ ; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: vmrghb v3, v3, v4 +; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mfvsrwz r3, f3 ; CHECK-BE-NEXT: xxswapd vs3, vs2 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: vmrghb v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mfvsrwz r3, f3 ; CHECK-BE-NEXT: xscvspdpn f3, vs2 ; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1 @@ -666,15 +666,15 @@ ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v5, r3 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: mfvsrwz r3, f2 -; CHECK-BE-NEXT: xxswapd vs2, vs1 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: vmrghb v4, v4, v5 ; CHECK-BE-NEXT: vmrghh v3, v4, v3 -; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: mfvsrwz r3, f2 +; CHECK-BE-NEXT: xxswapd vs2, vs1 +; CHECK-BE-NEXT: sldi r3, r3, 56 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: xscvspdpn f2, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 @@ -693,14 +693,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v5, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghb v4, v4, v5 +; CHECK-BE-NEXT: vmrghh v3, v4, v3 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghb v4, v4, v5 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v3, v4, v3 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -757,18 +757,18 @@ ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v2, vs1 +; CHECK-P9-NEXT: addi r3, r1, -2 ; CHECK-P9-NEXT: xxswapd v3, vs0 ; CHECK-P9-NEXT: vmrglb v2, v3, v2 ; CHECK-P9-NEXT: vsldoi v2, v2, v2, 8 -; CHECK-P9-NEXT: addi r3, r1, -2 ; CHECK-P9-NEXT: stxsihx v2, 0, r3 ; CHECK-P9-NEXT: lhz r3, -2(r1) ; CHECK-P9-NEXT: blr @@ -849,9 +849,9 @@ ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, v2 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, v2, v2, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 @@ -968,10 +968,10 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v2, vs2 @@ -985,18 +985,18 @@ ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v2, v3, v2 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: vmrglb v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 @@ -1011,13 +1011,13 @@ ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 @@ -1028,10 +1028,10 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: xxswapd vs2, vs1 ; CHECK-BE-NEXT: sldi r3, r3, 56 @@ -1056,14 +1056,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghb v3, v3, v4 +; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghb v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -1213,12 +1213,12 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs2, 0(r3) -; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 -; CHECK-P9-NEXT: xscvspdpn f3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) ; CHECK-P9-NEXT: lxv vs4, 16(r3) +; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 +; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: xxswapd v2, vs3 @@ -1232,18 +1232,18 @@ ; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: vmrglb v2, v3, v2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: xxswapd v3, vs3 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: vmrglb v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs3 ; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 @@ -1256,9 +1256,9 @@ ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xscvspdpn f2, vs4 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 1 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 @@ -1268,8 +1268,8 @@ ; CHECK-P9-NEXT: xxswapd v5, vs2 ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f2 @@ -1285,18 +1285,18 @@ ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v5, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 -; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 @@ -1311,13 +1311,13 @@ ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglb v4, v5, v4 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v4, v5, v4 ; CHECK-P9-NEXT: xxswapd v5, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v0, vs0 ; CHECK-P9-NEXT: vmrglb v5, v5, v0 ; CHECK-P9-NEXT: vmrglh v4, v5, v4 @@ -1328,12 +1328,12 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) ; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 +; CHECK-BE-NEXT: xscvspdpn f4, vs4 +; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: mfvsrwz r3, f4 ; CHECK-BE-NEXT: xxswapd vs4, vs3 ; CHECK-BE-NEXT: sldi r3, r3, 56 @@ -1358,14 +1358,14 @@ ; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: vmrghb v3, v3, v4 +; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mfvsrwz r3, f3 ; CHECK-BE-NEXT: xxswapd vs3, vs2 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: vmrghb v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mfvsrwz r3, f3 ; CHECK-BE-NEXT: xscvspdpn f3, vs2 ; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1 @@ -1384,15 +1384,15 @@ ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v5, r3 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: mfvsrwz r3, f2 -; CHECK-BE-NEXT: xxswapd vs2, vs1 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: vmrghb v4, v4, v5 ; CHECK-BE-NEXT: vmrghh v3, v4, v3 -; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: mfvsrwz r3, f2 +; CHECK-BE-NEXT: xxswapd vs2, vs1 +; CHECK-BE-NEXT: sldi r3, r3, 56 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: xscvspdpn f2, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 @@ -1411,14 +1411,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v5, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghb v4, v4, v5 +; CHECK-BE-NEXT: vmrghh v3, v4, v3 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghb v4, v4, v5 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v3, v4, v3 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mfvsrwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll @@ -97,10 +97,10 @@ ; CHECK-P9-LABEL: test4elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 @@ -110,12 +110,12 @@ ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglh v2, v2, v3 ; CHECK-P9-NEXT: xxswapd v3, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: vmrglw v2, v3, v2 @@ -125,10 +125,10 @@ ; CHECK-BE-LABEL: test4elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xscvdpsxws f2, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: sldi r3, r3, 48 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -214,12 +214,12 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 -; CHECK-P9-NEXT: xxswapd vs3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) ; CHECK-P9-NEXT: lxv vs2, 16(r3) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 +; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mfvsrwz r3, f4 ; CHECK-P9-NEXT: mtvsrd f4, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 @@ -229,33 +229,33 @@ ; CHECK-P9-NEXT: xscvdpsxws f3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglh v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: xxswapd v3, vs3 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglh v3, v3, v4 +; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 -; CHECK-P9-NEXT: vmrglh v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs3 -; CHECK-P9-NEXT: vmrglh v3, v3, v4 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: xxswapd v4, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglh v4, v4, v5 ; CHECK-P9-NEXT: vmrglw v3, v4, v3 @@ -265,12 +265,12 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xscvdpsxws f4, f3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f4, f3 +; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mfvsrwz r3, f4 ; CHECK-BE-NEXT: sldi r3, r3, 48 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -431,128 +431,128 @@ ; CHECK-P9-NEXT: lxv vs4, 0(r4) ; CHECK-P9-NEXT: lxv vs3, 16(r4) ; CHECK-P9-NEXT: lxv vs2, 32(r4) -; CHECK-P9-NEXT: xscvdpsxws f5, f4 ; CHECK-P9-NEXT: lxv vs1, 48(r4) +; CHECK-P9-NEXT: xscvdpsxws f5, f4 ; CHECK-P9-NEXT: xscvdpsxws f6, f3 ; CHECK-P9-NEXT: lxv vs0, 64(r4) ; CHECK-P9-NEXT: xscvdpsxws f7, f2 ; CHECK-P9-NEXT: xscvdpsxws f8, f1 ; CHECK-P9-NEXT: xxswapd vs4, vs4 -; CHECK-P9-NEXT: xscvdpsxws f4, f4 -; CHECK-P9-NEXT: mfvsrwz r5, f5 ; CHECK-P9-NEXT: xscvdpsxws f9, f0 ; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xxswapd vs2, vs2 +; CHECK-P9-NEXT: xxswapd vs1, vs1 +; CHECK-P9-NEXT: xxswapd vs0, vs0 +; CHECK-P9-NEXT: xscvdpsxws f4, f4 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: mfvsrwz r5, f5 ; CHECK-P9-NEXT: mtvsrd f5, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f6 -; CHECK-P9-NEXT: xxswapd vs2, vs2 -; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: mtvsrd f6, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f7 +; CHECK-P9-NEXT: xxswapd v2, vs5 ; CHECK-P9-NEXT: mtvsrd f7, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f8 ; CHECK-P9-NEXT: mtvsrd f8, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f9 ; CHECK-P9-NEXT: mtvsrd f9, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f4 +; CHECK-P9-NEXT: xxswapd v5, vs8 ; CHECK-P9-NEXT: mtvsrd f4, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f3 -; CHECK-P9-NEXT: xxswapd vs1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xxswapd v2, vs5 -; CHECK-P9-NEXT: xxswapd v5, vs8 ; CHECK-P9-NEXT: xxswapd v0, vs9 ; CHECK-P9-NEXT: mtvsrd f3, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f2 -; CHECK-P9-NEXT: mtvsrd f2, r5 -; CHECK-P9-NEXT: xxswapd vs0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: xxswapd v1, vs2 -; CHECK-P9-NEXT: lxv vs2, 80(r4) ; CHECK-P9-NEXT: xxswapd v3, vs4 -; CHECK-P9-NEXT: vmrglh v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs6 +; CHECK-P9-NEXT: mtvsrd f2, r5 ; CHECK-P9-NEXT: xxswapd v4, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f2 -; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: mfvsrwz r5, f1 -; CHECK-P9-NEXT: vmrglh v3, v3, v4 -; CHECK-P9-NEXT: xxswapd v4, vs7 +; CHECK-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-P9-NEXT: xxswapd v3, vs6 +; CHECK-P9-NEXT: xxswapd v1, vs2 +; CHECK-P9-NEXT: lxv vs2, 80(r4) ; CHECK-P9-NEXT: mtvsrd f1, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f0 +; CHECK-P9-NEXT: vmrglh v3, v3, v4 +; CHECK-P9-NEXT: xxswapd v4, vs7 +; CHECK-P9-NEXT: mtvsrd f0, r5 +; CHECK-P9-NEXT: xscvdpsxws f3, f2 +; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: vmrglh v4, v4, v1 ; CHECK-P9-NEXT: xxswapd v1, vs1 -; CHECK-P9-NEXT: mtvsrd f0, r5 -; CHECK-P9-NEXT: vmrglh v5, v5, v1 +; CHECK-P9-NEXT: lxv vs1, 96(r4) +; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglh v5, v5, v1 ; CHECK-P9-NEXT: xxswapd v1, vs0 ; CHECK-P9-NEXT: lxv vs0, 112(r4) -; CHECK-P9-NEXT: lxv vs1, 96(r4) +; CHECK-P9-NEXT: vmrglw v3, v5, v4 +; CHECK-P9-NEXT: vmrglh v0, v0, v1 ; CHECK-P9-NEXT: mfvsrwz r4, f3 ; CHECK-P9-NEXT: mtvsrd f3, r4 -; CHECK-P9-NEXT: mfvsrwz r4, f2 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 -; CHECK-P9-NEXT: vmrglw v3, v5, v4 ; CHECK-P9-NEXT: xxmrgld vs4, v3, v2 +; CHECK-P9-NEXT: mfvsrwz r4, f2 ; CHECK-P9-NEXT: xxswapd v2, vs3 -; CHECK-P9-NEXT: vmrglh v0, v0, v1 ; CHECK-P9-NEXT: mtvsrd f2, r4 +; CHECK-P9-NEXT: stxv vs4, 0(r3) ; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-P9-NEXT: vmrglw v2, v2, v0 ; CHECK-P9-NEXT: mfvsrwz r4, f2 ; CHECK-P9-NEXT: mtvsrd f2, r4 ; CHECK-P9-NEXT: mfvsrwz r4, f1 +; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r4 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r4, f1 ; CHECK-P9-NEXT: mtvsrd f1, r4 ; CHECK-P9-NEXT: mfvsrwz r4, f0 -; CHECK-P9-NEXT: vmrglh v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs2 -; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: xxswapd v4, vs1 -; CHECK-P9-NEXT: vmrglw v2, v2, v0 ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglh v4, v4, v5 ; CHECK-P9-NEXT: vmrglw v3, v4, v3 ; CHECK-P9-NEXT: xxmrgld vs0, v3, v2 ; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: stxv vs4, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs4, 48(r4) +; CHECK-BE-NEXT: lxv vs3, 32(r4) +; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: xscvdpsxws f5, f4 ; CHECK-BE-NEXT: xxswapd vs4, vs4 -; CHECK-BE-NEXT: lxv vs3, 32(r4) ; CHECK-BE-NEXT: xscvdpsxws f6, f3 ; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f4, f4 -; CHECK-BE-NEXT: mfvsrwz r5, f5 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs2, 16(r4) -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f7, f2 +; CHECK-BE-NEXT: lxv vs0, 112(r4) ; CHECK-BE-NEXT: xxswapd vs2, vs2 +; CHECK-BE-NEXT: xscvdpsxws f4, f4 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: mfvsrwz r5, f5 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: mtvsrd v2, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f4 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: xscvdpsxws f4, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f6 ; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs0, 112(r4) ; CHECK-BE-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f3 @@ -584,12 +584,15 @@ ; CHECK-BE-NEXT: vmrghh v4, v4, v1 ; CHECK-BE-NEXT: mtvsrd v1, r5 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: vmrghh v5, v5, v1 ; CHECK-BE-NEXT: mfvsrwz r5, f0 ; CHECK-BE-NEXT: lxv vs0, 64(r4) +; CHECK-BE-NEXT: vmrghh v5, v5, v1 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: mfvsrwz r4, f3 -; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: mtvsrd v1, r5 ; CHECK-BE-NEXT: vmrghw v3, v5, v4 +; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: vmrghh v0, v0, v1 ; CHECK-BE-NEXT: xxmrghd vs3, v3, v2 ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: mfvsrwz r4, f2 @@ -597,10 +600,12 @@ ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: mtvsrd v3, r4 ; CHECK-BE-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-NEXT: mfvsrwz r4, f2 ; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: vmrghw v2, v2, v0 ; CHECK-BE-NEXT: mtvsrd v3, r4 ; CHECK-BE-NEXT: mfvsrwz r4, f1 ; CHECK-BE-NEXT: xscvdpsxws f1, f0 @@ -613,11 +618,6 @@ ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v4, r4 ; CHECK-BE-NEXT: mfvsrwz r4, f0 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: mtvsrd v1, r5 -; CHECK-BE-NEXT: vmrghh v0, v0, v1 -; CHECK-BE-NEXT: vmrghw v2, v2, v0 -; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v5, r4 ; CHECK-BE-NEXT: vmrghh v4, v4, v5 @@ -720,10 +720,10 @@ ; CHECK-P9-LABEL: test4elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 @@ -733,12 +733,12 @@ ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglh v2, v2, v3 ; CHECK-P9-NEXT: xxswapd v3, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: vmrglw v2, v3, v2 @@ -748,10 +748,10 @@ ; CHECK-BE-LABEL: test4elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xscvdpsxws f2, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: sldi r3, r3, 48 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -837,12 +837,12 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 -; CHECK-P9-NEXT: xxswapd vs3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) ; CHECK-P9-NEXT: lxv vs2, 16(r3) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 +; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mfvsrwz r3, f4 ; CHECK-P9-NEXT: mtvsrd f4, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 @@ -852,33 +852,33 @@ ; CHECK-P9-NEXT: xscvdpsxws f3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglh v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: xxswapd v3, vs3 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglh v3, v3, v4 +; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 -; CHECK-P9-NEXT: vmrglh v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs3 -; CHECK-P9-NEXT: vmrglh v3, v3, v4 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: xxswapd v4, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglh v4, v4, v5 ; CHECK-P9-NEXT: vmrglw v3, v4, v3 @@ -888,12 +888,12 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xscvdpsxws f4, f3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f4, f3 +; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mfvsrwz r3, f4 ; CHECK-BE-NEXT: sldi r3, r3, 48 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -1054,128 +1054,128 @@ ; CHECK-P9-NEXT: lxv vs4, 0(r4) ; CHECK-P9-NEXT: lxv vs3, 16(r4) ; CHECK-P9-NEXT: lxv vs2, 32(r4) -; CHECK-P9-NEXT: xscvdpsxws f5, f4 ; CHECK-P9-NEXT: lxv vs1, 48(r4) +; CHECK-P9-NEXT: xscvdpsxws f5, f4 ; CHECK-P9-NEXT: xscvdpsxws f6, f3 ; CHECK-P9-NEXT: lxv vs0, 64(r4) ; CHECK-P9-NEXT: xscvdpsxws f7, f2 ; CHECK-P9-NEXT: xscvdpsxws f8, f1 ; CHECK-P9-NEXT: xxswapd vs4, vs4 -; CHECK-P9-NEXT: xscvdpsxws f4, f4 -; CHECK-P9-NEXT: mfvsrwz r5, f5 ; CHECK-P9-NEXT: xscvdpsxws f9, f0 ; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xxswapd vs2, vs2 +; CHECK-P9-NEXT: xxswapd vs1, vs1 +; CHECK-P9-NEXT: xxswapd vs0, vs0 +; CHECK-P9-NEXT: xscvdpsxws f4, f4 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: mfvsrwz r5, f5 ; CHECK-P9-NEXT: mtvsrd f5, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f6 -; CHECK-P9-NEXT: xxswapd vs2, vs2 -; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: mtvsrd f6, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f7 +; CHECK-P9-NEXT: xxswapd v2, vs5 ; CHECK-P9-NEXT: mtvsrd f7, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f8 ; CHECK-P9-NEXT: mtvsrd f8, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f9 ; CHECK-P9-NEXT: mtvsrd f9, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f4 +; CHECK-P9-NEXT: xxswapd v5, vs8 ; CHECK-P9-NEXT: mtvsrd f4, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f3 -; CHECK-P9-NEXT: xxswapd vs1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xxswapd v2, vs5 -; CHECK-P9-NEXT: xxswapd v5, vs8 ; CHECK-P9-NEXT: xxswapd v0, vs9 ; CHECK-P9-NEXT: mtvsrd f3, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f2 -; CHECK-P9-NEXT: mtvsrd f2, r5 -; CHECK-P9-NEXT: xxswapd vs0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: xxswapd v1, vs2 -; CHECK-P9-NEXT: lxv vs2, 80(r4) ; CHECK-P9-NEXT: xxswapd v3, vs4 -; CHECK-P9-NEXT: vmrglh v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs6 +; CHECK-P9-NEXT: mtvsrd f2, r5 ; CHECK-P9-NEXT: xxswapd v4, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f2 -; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: mfvsrwz r5, f1 -; CHECK-P9-NEXT: vmrglh v3, v3, v4 -; CHECK-P9-NEXT: xxswapd v4, vs7 +; CHECK-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-P9-NEXT: xxswapd v3, vs6 +; CHECK-P9-NEXT: xxswapd v1, vs2 +; CHECK-P9-NEXT: lxv vs2, 80(r4) ; CHECK-P9-NEXT: mtvsrd f1, r5 ; CHECK-P9-NEXT: mfvsrwz r5, f0 +; CHECK-P9-NEXT: vmrglh v3, v3, v4 +; CHECK-P9-NEXT: xxswapd v4, vs7 +; CHECK-P9-NEXT: mtvsrd f0, r5 +; CHECK-P9-NEXT: xscvdpsxws f3, f2 +; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: vmrglh v4, v4, v1 ; CHECK-P9-NEXT: xxswapd v1, vs1 -; CHECK-P9-NEXT: mtvsrd f0, r5 -; CHECK-P9-NEXT: vmrglh v5, v5, v1 +; CHECK-P9-NEXT: lxv vs1, 96(r4) +; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglh v5, v5, v1 ; CHECK-P9-NEXT: xxswapd v1, vs0 ; CHECK-P9-NEXT: lxv vs0, 112(r4) -; CHECK-P9-NEXT: lxv vs1, 96(r4) +; CHECK-P9-NEXT: vmrglw v3, v5, v4 +; CHECK-P9-NEXT: vmrglh v0, v0, v1 ; CHECK-P9-NEXT: mfvsrwz r4, f3 ; CHECK-P9-NEXT: mtvsrd f3, r4 -; CHECK-P9-NEXT: mfvsrwz r4, f2 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 -; CHECK-P9-NEXT: vmrglw v3, v5, v4 ; CHECK-P9-NEXT: xxmrgld vs4, v3, v2 +; CHECK-P9-NEXT: mfvsrwz r4, f2 ; CHECK-P9-NEXT: xxswapd v2, vs3 -; CHECK-P9-NEXT: vmrglh v0, v0, v1 ; CHECK-P9-NEXT: mtvsrd f2, r4 +; CHECK-P9-NEXT: stxv vs4, 0(r3) ; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-P9-NEXT: vmrglw v2, v2, v0 ; CHECK-P9-NEXT: mfvsrwz r4, f2 ; CHECK-P9-NEXT: mtvsrd f2, r4 ; CHECK-P9-NEXT: mfvsrwz r4, f1 +; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r4 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r4, f1 ; CHECK-P9-NEXT: mtvsrd f1, r4 ; CHECK-P9-NEXT: mfvsrwz r4, f0 -; CHECK-P9-NEXT: vmrglh v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs2 -; CHECK-P9-NEXT: vmrglh v3, v3, v4 ; CHECK-P9-NEXT: xxswapd v4, vs1 -; CHECK-P9-NEXT: vmrglw v2, v2, v0 ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglh v4, v4, v5 ; CHECK-P9-NEXT: vmrglw v3, v4, v3 ; CHECK-P9-NEXT: xxmrgld vs0, v3, v2 ; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: stxv vs4, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs4, 48(r4) +; CHECK-BE-NEXT: lxv vs3, 32(r4) +; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: xscvdpsxws f5, f4 ; CHECK-BE-NEXT: xxswapd vs4, vs4 -; CHECK-BE-NEXT: lxv vs3, 32(r4) ; CHECK-BE-NEXT: xscvdpsxws f6, f3 ; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f4, f4 -; CHECK-BE-NEXT: mfvsrwz r5, f5 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs2, 16(r4) -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f7, f2 +; CHECK-BE-NEXT: lxv vs0, 112(r4) ; CHECK-BE-NEXT: xxswapd vs2, vs2 +; CHECK-BE-NEXT: xscvdpsxws f4, f4 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: mfvsrwz r5, f5 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: mtvsrd v2, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f4 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: xscvdpsxws f4, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f6 ; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs0, 112(r4) ; CHECK-BE-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mfvsrwz r5, f3 @@ -1207,12 +1207,15 @@ ; CHECK-BE-NEXT: vmrghh v4, v4, v1 ; CHECK-BE-NEXT: mtvsrd v1, r5 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: vmrghh v5, v5, v1 ; CHECK-BE-NEXT: mfvsrwz r5, f0 ; CHECK-BE-NEXT: lxv vs0, 64(r4) +; CHECK-BE-NEXT: vmrghh v5, v5, v1 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: mfvsrwz r4, f3 -; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: mtvsrd v1, r5 ; CHECK-BE-NEXT: vmrghw v3, v5, v4 +; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: vmrghh v0, v0, v1 ; CHECK-BE-NEXT: xxmrghd vs3, v3, v2 ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: mfvsrwz r4, f2 @@ -1220,10 +1223,12 @@ ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: mtvsrd v3, r4 ; CHECK-BE-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-NEXT: mfvsrwz r4, f2 ; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: vmrghw v2, v2, v0 ; CHECK-BE-NEXT: mtvsrd v3, r4 ; CHECK-BE-NEXT: mfvsrwz r4, f1 ; CHECK-BE-NEXT: xscvdpsxws f1, f0 @@ -1236,11 +1241,6 @@ ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v4, r4 ; CHECK-BE-NEXT: mfvsrwz r4, f0 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: mtvsrd v1, r5 -; CHECK-BE-NEXT: vmrghh v0, v0, v1 -; CHECK-BE-NEXT: vmrghw v2, v2, v0 -; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v5, r4 ; CHECK-BE-NEXT: vmrghh v4, v4, v5 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll @@ -131,10 +131,10 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs2, 0(r4) ; CHECK-P9-NEXT: lxv vs3, 16(r4) -; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 -; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: lxv vs1, 48(r4) +; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 +; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-P9-NEXT: xvcvdpuxws v2, vs4 ; CHECK-P9-NEXT: xvcvdpuxws v3, vs2 ; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0 @@ -151,10 +151,10 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs2, 16(r4) ; CHECK-BE-NEXT: lxv vs3, 0(r4) -; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 -; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-BE-NEXT: lxv vs0, 48(r4) ; CHECK-BE-NEXT: lxv vs1, 32(r4) +; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 +; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-BE-NEXT: xvcvdpuxws v2, vs4 ; CHECK-BE-NEXT: xvcvdpuxws v3, vs2 ; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0 @@ -229,23 +229,23 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs6, 0(r4) ; CHECK-P9-NEXT: lxv vs7, 16(r4) -; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6 -; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: lxv vs4, 32(r4) ; CHECK-P9-NEXT: lxv vs5, 48(r4) +; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6 +; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: xxmrgld vs7, vs5, vs4 ; CHECK-P9-NEXT: xxmrghd vs4, vs5, vs4 -; CHECK-P9-NEXT: xvcvdpuxws v2, vs8 -; CHECK-P9-NEXT: xvcvdpuxws v3, vs6 ; CHECK-P9-NEXT: lxv vs2, 64(r4) ; CHECK-P9-NEXT: lxv vs3, 80(r4) +; CHECK-P9-NEXT: lxv vs0, 96(r4) +; CHECK-P9-NEXT: lxv vs1, 112(r4) +; CHECK-P9-NEXT: xvcvdpuxws v2, vs8 +; CHECK-P9-NEXT: xvcvdpuxws v3, vs6 ; CHECK-P9-NEXT: xvcvdpuxws v4, vs7 ; CHECK-P9-NEXT: vmrgew v2, v3, v2 ; CHECK-P9-NEXT: xvcvdpuxws v3, vs4 ; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 ; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 -; CHECK-P9-NEXT: lxv vs0, 96(r4) -; CHECK-P9-NEXT: lxv vs1, 112(r4) ; CHECK-P9-NEXT: stxv v2, 0(r3) ; CHECK-P9-NEXT: xvcvdpuxws v5, vs2 ; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0 @@ -265,23 +265,23 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs6, 16(r4) ; CHECK-BE-NEXT: lxv vs7, 0(r4) -; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6 -; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-BE-NEXT: lxv vs4, 48(r4) ; CHECK-BE-NEXT: lxv vs5, 32(r4) +; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6 +; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-BE-NEXT: xxmrgld vs7, vs5, vs4 ; CHECK-BE-NEXT: xxmrghd vs4, vs5, vs4 -; CHECK-BE-NEXT: xvcvdpuxws v2, vs8 -; CHECK-BE-NEXT: xvcvdpuxws v3, vs6 ; CHECK-BE-NEXT: lxv vs2, 80(r4) ; CHECK-BE-NEXT: lxv vs3, 64(r4) +; CHECK-BE-NEXT: lxv vs0, 112(r4) +; CHECK-BE-NEXT: lxv vs1, 96(r4) +; CHECK-BE-NEXT: xvcvdpuxws v2, vs8 +; CHECK-BE-NEXT: xvcvdpuxws v3, vs6 ; CHECK-BE-NEXT: xvcvdpuxws v4, vs7 ; CHECK-BE-NEXT: vmrgew v2, v3, v2 ; CHECK-BE-NEXT: xvcvdpuxws v3, vs4 ; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 ; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 -; CHECK-BE-NEXT: lxv vs0, 112(r4) -; CHECK-BE-NEXT: lxv vs1, 96(r4) ; CHECK-BE-NEXT: stxv v2, 0(r3) ; CHECK-BE-NEXT: xvcvdpuxws v5, vs2 ; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0 @@ -425,10 +425,10 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs2, 0(r4) ; CHECK-P9-NEXT: lxv vs3, 16(r4) -; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 -; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: lxv vs1, 48(r4) +; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 +; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-P9-NEXT: xvcvdpsxws v2, vs4 ; CHECK-P9-NEXT: xvcvdpsxws v3, vs2 ; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0 @@ -445,10 +445,10 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs2, 16(r4) ; CHECK-BE-NEXT: lxv vs3, 0(r4) -; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 -; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-BE-NEXT: lxv vs0, 48(r4) ; CHECK-BE-NEXT: lxv vs1, 32(r4) +; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 +; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-BE-NEXT: xvcvdpsxws v2, vs4 ; CHECK-BE-NEXT: xvcvdpsxws v3, vs2 ; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0 @@ -523,23 +523,23 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs6, 0(r4) ; CHECK-P9-NEXT: lxv vs7, 16(r4) -; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6 -; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: lxv vs4, 32(r4) ; CHECK-P9-NEXT: lxv vs5, 48(r4) +; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6 +; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: xxmrgld vs7, vs5, vs4 ; CHECK-P9-NEXT: xxmrghd vs4, vs5, vs4 -; CHECK-P9-NEXT: xvcvdpsxws v2, vs8 -; CHECK-P9-NEXT: xvcvdpsxws v3, vs6 ; CHECK-P9-NEXT: lxv vs2, 64(r4) ; CHECK-P9-NEXT: lxv vs3, 80(r4) +; CHECK-P9-NEXT: lxv vs0, 96(r4) +; CHECK-P9-NEXT: lxv vs1, 112(r4) +; CHECK-P9-NEXT: xvcvdpsxws v2, vs8 +; CHECK-P9-NEXT: xvcvdpsxws v3, vs6 ; CHECK-P9-NEXT: xvcvdpsxws v4, vs7 ; CHECK-P9-NEXT: vmrgew v2, v3, v2 ; CHECK-P9-NEXT: xvcvdpsxws v3, vs4 ; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 ; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 -; CHECK-P9-NEXT: lxv vs0, 96(r4) -; CHECK-P9-NEXT: lxv vs1, 112(r4) ; CHECK-P9-NEXT: stxv v2, 0(r3) ; CHECK-P9-NEXT: xvcvdpsxws v5, vs2 ; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0 @@ -559,23 +559,23 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs6, 16(r4) ; CHECK-BE-NEXT: lxv vs7, 0(r4) -; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6 -; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-BE-NEXT: lxv vs4, 48(r4) ; CHECK-BE-NEXT: lxv vs5, 32(r4) +; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6 +; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-BE-NEXT: xxmrgld vs7, vs5, vs4 ; CHECK-BE-NEXT: xxmrghd vs4, vs5, vs4 -; CHECK-BE-NEXT: xvcvdpsxws v2, vs8 -; CHECK-BE-NEXT: xvcvdpsxws v3, vs6 ; CHECK-BE-NEXT: lxv vs2, 80(r4) ; CHECK-BE-NEXT: lxv vs3, 64(r4) +; CHECK-BE-NEXT: lxv vs0, 112(r4) +; CHECK-BE-NEXT: lxv vs1, 96(r4) +; CHECK-BE-NEXT: xvcvdpsxws v2, vs8 +; CHECK-BE-NEXT: xvcvdpsxws v3, vs6 ; CHECK-BE-NEXT: xvcvdpsxws v4, vs7 ; CHECK-BE-NEXT: vmrgew v2, v3, v2 ; CHECK-BE-NEXT: xvcvdpsxws v3, vs4 ; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 ; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 -; CHECK-BE-NEXT: lxv vs0, 112(r4) -; CHECK-BE-NEXT: lxv vs1, 96(r4) ; CHECK-BE-NEXT: stxv v2, 0(r3) ; CHECK-BE-NEXT: xvcvdpsxws v5, vs2 ; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll @@ -104,10 +104,10 @@ ; CHECK-P9-LABEL: test4elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 @@ -117,26 +117,26 @@ ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglb v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v2, v2, v3 ; CHECK-P9-NEXT: xxswapd v3, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 +; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: vmrglh v2, v3, v2 -; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: vextuwrx r3, r3, v2 ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test4elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xscvdpsxws f2, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -225,12 +225,12 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 -; CHECK-P9-NEXT: xxswapd vs3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) ; CHECK-P9-NEXT: lxv vs2, 16(r3) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 +; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mfvsrwz r3, f4 ; CHECK-P9-NEXT: mtvsrd f4, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 @@ -240,33 +240,33 @@ ; CHECK-P9-NEXT: xscvdpsxws f3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: xxswapd v3, vs3 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 -; CHECK-P9-NEXT: vmrglb v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs3 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: xxswapd v4, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 @@ -277,12 +277,12 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xscvdpsxws f4, f3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f4, f3 +; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mfvsrwz r3, f4 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -442,16 +442,16 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs7, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f8, f7 -; CHECK-P9-NEXT: xxswapd vs7, vs7 -; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: lxv vs0, 112(r3) ; CHECK-P9-NEXT: lxv vs1, 96(r3) ; CHECK-P9-NEXT: lxv vs2, 80(r3) +; CHECK-P9-NEXT: xscvdpsxws f8, f7 +; CHECK-P9-NEXT: xxswapd vs7, vs7 ; CHECK-P9-NEXT: lxv vs3, 64(r3) ; CHECK-P9-NEXT: lxv vs4, 48(r3) ; CHECK-P9-NEXT: lxv vs5, 32(r3) ; CHECK-P9-NEXT: lxv vs6, 16(r3) +; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: mfvsrwz r3, f8 ; CHECK-P9-NEXT: mtvsrd f8, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f7 @@ -461,59 +461,59 @@ ; CHECK-P9-NEXT: xscvdpsxws f7, f6 ; CHECK-P9-NEXT: xxswapd vs6, vs6 ; CHECK-P9-NEXT: xscvdpsxws f6, f6 +; CHECK-P9-NEXT: vmrglb v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f7 ; CHECK-P9-NEXT: mtvsrd f7, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f6 +; CHECK-P9-NEXT: xxswapd v3, vs7 ; CHECK-P9-NEXT: mtvsrd f6, r3 ; CHECK-P9-NEXT: xxswapd v4, vs6 ; CHECK-P9-NEXT: xscvdpsxws f6, f5 ; CHECK-P9-NEXT: xxswapd vs5, vs5 ; CHECK-P9-NEXT: xscvdpsxws f5, f5 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f6 ; CHECK-P9-NEXT: mtvsrd f6, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f5 -; CHECK-P9-NEXT: vmrglb v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs7 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: xxswapd v3, vs6 ; CHECK-P9-NEXT: mtvsrd f5, r3 ; CHECK-P9-NEXT: xxswapd v4, vs5 ; CHECK-P9-NEXT: xscvdpsxws f5, f4 ; CHECK-P9-NEXT: xxswapd vs4, vs4 ; CHECK-P9-NEXT: xscvdpsxws f4, f4 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r3, f5 ; CHECK-P9-NEXT: mtvsrd f5, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f4 +; CHECK-P9-NEXT: xxswapd v4, vs5 ; CHECK-P9-NEXT: mtvsrd f4, r3 ; CHECK-P9-NEXT: xxswapd v5, vs4 ; CHECK-P9-NEXT: xscvdpsxws f4, f3 ; CHECK-P9-NEXT: xxswapd vs3, vs3 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: xxswapd v4, vs5 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f4 ; CHECK-P9-NEXT: mtvsrd f4, r3 +; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f3 +; CHECK-P9-NEXT: xxswapd v3, vs4 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: xxswapd v4, vs3 ; CHECK-P9-NEXT: xscvdpsxws f3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: xxswapd v4, vs3 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v5, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs4 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: xxswapd v4, vs3 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 @@ -525,12 +525,12 @@ ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: xxswapd v5, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v0, vs0 ; CHECK-P9-NEXT: vmrglb v5, v5, v0 ; CHECK-P9-NEXT: vmrglh v4, v5, v4 @@ -541,16 +541,16 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs7, 112(r3) -; CHECK-BE-NEXT: xscvdpsxws f8, f7 -; CHECK-BE-NEXT: xxswapd vs7, vs7 -; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: lxv vs6, 96(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f8, f7 +; CHECK-BE-NEXT: xxswapd vs7, vs7 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs3, 48(r3) ; CHECK-BE-NEXT: lxv vs4, 64(r3) ; CHECK-BE-NEXT: lxv vs5, 80(r3) +; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: mfvsrwz r3, f8 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -737,10 +737,10 @@ ; CHECK-P9-LABEL: test4elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 @@ -750,26 +750,26 @@ ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglb v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v2, v2, v3 ; CHECK-P9-NEXT: xxswapd v3, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 +; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: xxswapd v4, vs0 ; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: vmrglh v2, v3, v2 -; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: vextuwrx r3, r3, v2 ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test4elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xscvdpsxws f2, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mfvsrwz r3, f2 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -858,12 +858,12 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 -; CHECK-P9-NEXT: xxswapd vs3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) ; CHECK-P9-NEXT: lxv vs2, 16(r3) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 +; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mfvsrwz r3, f4 ; CHECK-P9-NEXT: mtvsrd f4, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 @@ -873,33 +873,33 @@ ; CHECK-P9-NEXT: xscvdpsxws f3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: xxswapd v3, vs3 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v4, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f2 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f1 -; CHECK-P9-NEXT: vmrglb v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs3 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: xxswapd v3, vs2 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: xxswapd v4, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v5, vs0 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 @@ -910,12 +910,12 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xscvdpsxws f4, f3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f4, f3 +; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mfvsrwz r3, f4 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -1075,16 +1075,16 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs7, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f8, f7 -; CHECK-P9-NEXT: xxswapd vs7, vs7 -; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: lxv vs0, 112(r3) ; CHECK-P9-NEXT: lxv vs1, 96(r3) ; CHECK-P9-NEXT: lxv vs2, 80(r3) +; CHECK-P9-NEXT: xscvdpsxws f8, f7 +; CHECK-P9-NEXT: xxswapd vs7, vs7 ; CHECK-P9-NEXT: lxv vs3, 64(r3) ; CHECK-P9-NEXT: lxv vs4, 48(r3) ; CHECK-P9-NEXT: lxv vs5, 32(r3) ; CHECK-P9-NEXT: lxv vs6, 16(r3) +; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: mfvsrwz r3, f8 ; CHECK-P9-NEXT: mtvsrd f8, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f7 @@ -1094,59 +1094,59 @@ ; CHECK-P9-NEXT: xscvdpsxws f7, f6 ; CHECK-P9-NEXT: xxswapd vs6, vs6 ; CHECK-P9-NEXT: xscvdpsxws f6, f6 +; CHECK-P9-NEXT: vmrglb v2, v2, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f7 ; CHECK-P9-NEXT: mtvsrd f7, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f6 +; CHECK-P9-NEXT: xxswapd v3, vs7 ; CHECK-P9-NEXT: mtvsrd f6, r3 ; CHECK-P9-NEXT: xxswapd v4, vs6 ; CHECK-P9-NEXT: xscvdpsxws f6, f5 ; CHECK-P9-NEXT: xxswapd vs5, vs5 ; CHECK-P9-NEXT: xscvdpsxws f5, f5 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f6 ; CHECK-P9-NEXT: mtvsrd f6, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f5 -; CHECK-P9-NEXT: vmrglb v2, v2, v3 -; CHECK-P9-NEXT: xxswapd v3, vs7 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: xxswapd v3, vs6 ; CHECK-P9-NEXT: mtvsrd f5, r3 ; CHECK-P9-NEXT: xxswapd v4, vs5 ; CHECK-P9-NEXT: xscvdpsxws f5, f4 ; CHECK-P9-NEXT: xxswapd vs4, vs4 ; CHECK-P9-NEXT: xscvdpsxws f4, f4 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r3, f5 ; CHECK-P9-NEXT: mtvsrd f5, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f4 +; CHECK-P9-NEXT: xxswapd v4, vs5 ; CHECK-P9-NEXT: mtvsrd f4, r3 ; CHECK-P9-NEXT: xxswapd v5, vs4 ; CHECK-P9-NEXT: xscvdpsxws f4, f3 ; CHECK-P9-NEXT: xxswapd vs3, vs3 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: xxswapd v4, vs5 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f4 ; CHECK-P9-NEXT: mtvsrd f4, r3 +; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mfvsrwz r3, f3 +; CHECK-P9-NEXT: xxswapd v3, vs4 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: xxswapd v4, vs3 ; CHECK-P9-NEXT: xscvdpsxws f3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 ; CHECK-P9-NEXT: mfvsrwz r3, f3 ; CHECK-P9-NEXT: mtvsrd f3, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: xxswapd v4, vs3 ; CHECK-P9-NEXT: mtvsrd f2, r3 ; CHECK-P9-NEXT: xxswapd v5, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs4 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: xxswapd v4, vs3 ; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: vmrglh v3, v4, v3 ; CHECK-P9-NEXT: mfvsrwz r3, f2 @@ -1158,12 +1158,12 @@ ; CHECK-P9-NEXT: xscvdpsxws f1, f0 ; CHECK-P9-NEXT: xxswapd vs0, vs0 ; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v4, v4, v5 ; CHECK-P9-NEXT: xxswapd v5, vs1 +; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v0, vs0 ; CHECK-P9-NEXT: vmrglb v5, v5, v0 ; CHECK-P9-NEXT: vmrglh v4, v5, v4 @@ -1174,16 +1174,16 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs7, 112(r3) -; CHECK-BE-NEXT: xscvdpsxws f8, f7 -; CHECK-BE-NEXT: xxswapd vs7, vs7 -; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: lxv vs6, 96(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f8, f7 +; CHECK-BE-NEXT: xxswapd vs7, vs7 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs3, 48(r3) ; CHECK-BE-NEXT: lxv vs4, 64(r3) ; CHECK-BE-NEXT: lxv vs5, 80(r3) +; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: mfvsrwz r3, f8 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll @@ -40,9 +40,9 @@ ; CHECK-P9-NEXT: mtfprwz f0, r3 ; CHECK-P9-NEXT: li r3, 2 ; CHECK-P9-NEXT: xscvuxdsp f0, f0 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: vextuhrx r3, r3, v2 ; CHECK-P9-NEXT: rlwinm r3, r3, 0, 16, 31 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 1 ; CHECK-P9-NEXT: mtfprwz f0, r3 ; CHECK-P9-NEXT: xscvuxdsp f0, f0 @@ -94,10 +94,10 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 +; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: vperm v2, v4, v2, v3 ; CHECK-P9-NEXT: xvcvuxwsp v2, v2 ; CHECK-P9-NEXT: blr @@ -106,9 +106,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r3 ; CHECK-BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r3, r3, .LCPI1_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r3 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-NEXT: xvcvuxwsp v2, v2 ; CHECK-BE-NEXT: blr @@ -140,9 +140,9 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -157,9 +157,9 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -210,9 +210,9 @@ ; CHECK-P9-NEXT: lxv v2, 16(r4) ; CHECK-P9-NEXT: lxv v3, 0(r4) ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-P9-NEXT: xxlxor v5, v5, v5 ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-P9-NEXT: lxvx v4, 0, r4 -; CHECK-P9-NEXT: xxlxor v5, v5, v5 ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-P9-NEXT: vperm v0, v5, v3, v4 @@ -235,9 +235,9 @@ ; CHECK-BE-NEXT: lxv v2, 16(r4) ; CHECK-BE-NEXT: lxv v3, 0(r4) ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-BE-NEXT: vperm v0, v3, v5, v4 @@ -292,9 +292,9 @@ ; CHECK-P9-NEXT: mtfprwa f0, r3 ; CHECK-P9-NEXT: li r3, 2 ; CHECK-P9-NEXT: xscvsxdsp f0, f0 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: vextuhrx r3, r3, v2 ; CHECK-P9-NEXT: extsh r3, r3 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 1 ; CHECK-P9-NEXT: mtfprwa f0, r3 ; CHECK-P9-NEXT: xscvsxdsp f0, f0 @@ -397,9 +397,9 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: vmrghh v2, v2, v2 ; CHECK-BE-NEXT: vextsh2w v3, v3 @@ -454,10 +454,10 @@ ; CHECK-P9-NEXT: vmrglh v4, v3, v3 ; CHECK-P9-NEXT: vmrghh v3, v3, v3 ; CHECK-P9-NEXT: vextsh2w v3, v3 +; CHECK-P9-NEXT: vextsh2w v4, v4 ; CHECK-P9-NEXT: xvcvsxwsp vs1, v3 ; CHECK-P9-NEXT: vmrglh v3, v2, v2 ; CHECK-P9-NEXT: vmrghh v2, v2, v2 -; CHECK-P9-NEXT: vextsh2w v4, v4 ; CHECK-P9-NEXT: xvcvsxwsp vs0, v4 ; CHECK-P9-NEXT: vextsh2w v3, v3 ; CHECK-P9-NEXT: vextsh2w v2, v2 @@ -474,9 +474,9 @@ ; CHECK-BE-NEXT: lxv v2, 16(r4) ; CHECK-BE-NEXT: lxv v3, 0(r4) ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha +; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: vperm v0, v5, v3, v4 ; CHECK-BE-NEXT: vperm v4, v5, v2, v4 ; CHECK-BE-NEXT: vmrghh v3, v3, v3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll @@ -26,9 +26,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrws v2, r3 ; CHECK-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: vperm v2, v4, v2, v3 ; CHECK-P9-NEXT: xvcvuxddp v2, v2 ; CHECK-P9-NEXT: blr @@ -37,9 +37,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r3 ; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r3 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-NEXT: xvcvuxddp v2, v2 ; CHECK-BE-NEXT: blr @@ -76,10 +76,10 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 +; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI1_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI1_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -95,9 +95,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI1_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI1_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -155,9 +155,9 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -184,9 +184,9 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -279,9 +279,9 @@ ; CHECK-P9-NEXT: lxv v2, 16(r4) ; CHECK-P9-NEXT: lxv v3, 0(r4) ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-P9-NEXT: xxlxor v5, v5, v5 ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-P9-NEXT: lxvx v4, 0, r4 -; CHECK-P9-NEXT: xxlxor v5, v5, v5 ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-P9-NEXT: vperm v0, v5, v3, v4 @@ -322,9 +322,9 @@ ; CHECK-BE-NEXT: lxv v2, 16(r4) ; CHECK-BE-NEXT: lxv v3, 0(r4) ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-BE-NEXT: vperm v0, v3, v5, v4 @@ -446,11 +446,11 @@ ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI5_0@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI5_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r4 ; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: vperm v3, v2, v2, v3 +; CHECK-P9-NEXT: lxvx v3, 0, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI5_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI5_1@toc@l +; CHECK-P9-NEXT: vperm v3, v2, v2, v3 ; CHECK-P9-NEXT: vextsh2d v3, v3 ; CHECK-P9-NEXT: xvcvsxddp vs0, v3 ; CHECK-P9-NEXT: lxvx v3, 0, r4 @@ -465,13 +465,13 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha +; CHECK-BE-NEXT: xxlxor v3, v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v3, v3, v3 -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha -; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l +; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: vperm v2, v2, v2, v3 @@ -570,12 +570,12 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 @@ -686,8 +686,8 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r5, r2, .LCPI7_0@toc@ha -; CHECK-P9-NEXT: addi r5, r5, .LCPI7_0@toc@l ; CHECK-P9-NEXT: lxv v2, 0(r4) +; CHECK-P9-NEXT: addi r5, r5, .LCPI7_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r5 ; CHECK-P9-NEXT: addis r5, r2, .LCPI7_1@toc@ha ; CHECK-P9-NEXT: addi r5, r5, .LCPI7_1@toc@l @@ -706,16 +706,17 @@ ; CHECK-P9-NEXT: xvcvsxddp vs1, v4 ; CHECK-P9-NEXT: vperm v4, v2, v2, v0 ; CHECK-P9-NEXT: vperm v2, v2, v2, v1 +; CHECK-P9-NEXT: stxv vs0, 0(r3) ; CHECK-P9-NEXT: vextsh2d v4, v4 ; CHECK-P9-NEXT: xvcvsxddp vs2, v4 ; CHECK-P9-NEXT: lxv v4, 16(r4) +; CHECK-P9-NEXT: stxv vs1, 16(r3) ; CHECK-P9-NEXT: vextsh2d v2, v2 ; CHECK-P9-NEXT: xvcvsxddp vs3, v2 ; CHECK-P9-NEXT: vperm v2, v4, v4, v3 ; CHECK-P9-NEXT: stxv vs2, 32(r3) ; CHECK-P9-NEXT: vextsh2d v2, v2 ; CHECK-P9-NEXT: stxv vs3, 48(r3) -; CHECK-P9-NEXT: stxv vs1, 16(r3) ; CHECK-P9-NEXT: xvcvsxddp vs4, v2 ; CHECK-P9-NEXT: vperm v2, v4, v4, v5 ; CHECK-P9-NEXT: vextsh2d v2, v2 @@ -726,60 +727,59 @@ ; CHECK-P9-NEXT: xvcvsxddp vs6, v2 ; CHECK-P9-NEXT: vperm v2, v4, v4, v1 ; CHECK-P9-NEXT: stxv vs5, 80(r3) -; CHECK-P9-NEXT: stxv vs6, 96(r3) ; CHECK-P9-NEXT: vextsh2d v2, v2 ; CHECK-P9-NEXT: xvcvsxddp vs7, v2 +; CHECK-P9-NEXT: stxv vs6, 96(r3) ; CHECK-P9-NEXT: stxv vs7, 112(r3) -; CHECK-P9-NEXT: stxv vs0, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r5, r2, .LCPI7_0@toc@ha -; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l -; CHECK-BE-NEXT: lxvx v2, 0, r5 ; CHECK-BE-NEXT: lxv v4, 0(r4) ; CHECK-BE-NEXT: lxv v1, 16(r4) +; CHECK-BE-NEXT: xxlxor v5, v5, v5 +; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha +; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l +; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l +; CHECK-BE-NEXT: lxvx v2, 0, r5 ; CHECK-BE-NEXT: addis r5, r2, .LCPI7_1@toc@ha ; CHECK-BE-NEXT: addi r5, r5, .LCPI7_1@toc@l -; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha -; CHECK-BE-NEXT: xxlxor v5, v5, v5 -; CHECK-BE-NEXT: vperm v0, v5, v4, v2 ; CHECK-BE-NEXT: lxvx v3, 0, r5 +; CHECK-BE-NEXT: vperm v0, v5, v4, v2 ; CHECK-BE-NEXT: vperm v2, v5, v1, v2 ; CHECK-BE-NEXT: vextsh2d v2, v2 -; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l ; CHECK-BE-NEXT: vextsh2d v0, v0 ; CHECK-BE-NEXT: xvcvsxddp vs2, v2 ; CHECK-BE-NEXT: vperm v2, v5, v1, v3 +; CHECK-BE-NEXT: xvcvsxddp vs0, v0 +; CHECK-BE-NEXT: vperm v0, v5, v4, v3 ; CHECK-BE-NEXT: vextsh2d v2, v2 -; CHECK-BE-NEXT: stxv vs2, 80(r3) +; CHECK-BE-NEXT: vextsh2d v0, v0 ; CHECK-BE-NEXT: xvcvsxddp vs3, v2 ; CHECK-BE-NEXT: lxvx v2, 0, r4 -; CHECK-BE-NEXT: xvcvsxddp vs0, v0 -; CHECK-BE-NEXT: vperm v0, v5, v4, v3 -; CHECK-BE-NEXT: vperm v3, v4, v4, v2 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_3@toc@ha -; CHECK-BE-NEXT: vextsh2d v0, v0 ; CHECK-BE-NEXT: xvcvsxddp vs1, v0 +; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l +; CHECK-BE-NEXT: stxv vs2, 80(r3) +; CHECK-BE-NEXT: stxv vs0, 16(r3) +; CHECK-BE-NEXT: vperm v3, v4, v4, v2 +; CHECK-BE-NEXT: vperm v2, v1, v1, v2 +; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs1, 48(r3) ; CHECK-BE-NEXT: vextsh2d v3, v3 -; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l +; CHECK-BE-NEXT: vextsh2d v2, v2 ; CHECK-BE-NEXT: xvcvsxddp vs4, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: vperm v2, v1, v1, v2 -; CHECK-BE-NEXT: vextsh2d v2, v2 ; CHECK-BE-NEXT: xvcvsxddp vs6, v2 -; CHECK-BE-NEXT: vperm v2, v1, v1, v3 ; CHECK-BE-NEXT: vperm v4, v4, v4, v3 +; CHECK-BE-NEXT: vperm v2, v1, v1, v3 +; CHECK-BE-NEXT: stxv vs6, 64(r3) +; CHECK-BE-NEXT: stxv vs4, 0(r3) ; CHECK-BE-NEXT: vextsh2d v4, v4 ; CHECK-BE-NEXT: vextsh2d v2, v2 -; CHECK-BE-NEXT: xvcvsxddp vs7, v2 ; CHECK-BE-NEXT: xvcvsxddp vs5, v4 -; CHECK-BE-NEXT: stxv vs3, 112(r3) -; CHECK-BE-NEXT: stxv vs6, 64(r3) -; CHECK-BE-NEXT: stxv vs0, 16(r3) -; CHECK-BE-NEXT: stxv vs4, 0(r3) +; CHECK-BE-NEXT: xvcvsxddp vs7, v2 ; CHECK-BE-NEXT: stxv vs7, 96(r3) ; CHECK-BE-NEXT: stxv vs5, 32(r3) ; CHECK-BE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll @@ -106,8 +106,8 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r4) -; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-P9-NEXT: lxv vs0, 16(r4) +; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-P9-NEXT: xvcvuxwdp vs2, v2 ; CHECK-P9-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-P9-NEXT: xvcvuxwdp vs1, v2 @@ -124,8 +124,8 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 0(r4) -; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-BE-NEXT: lxv vs0, 16(r4) +; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-BE-NEXT: xvcvuxwdp vs2, v2 ; CHECK-BE-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-BE-NEXT: xvcvuxwdp vs1, v2 @@ -196,12 +196,12 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0 ; CHECK-P9-NEXT: lxv vs2, 16(r4) +; CHECK-P9-NEXT: lxv vs5, 32(r4) ; CHECK-P9-NEXT: lxv vs4, 48(r4) +; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0 ; CHECK-P9-NEXT: xvcvuxwdp vs1, v2 ; CHECK-P9-NEXT: xxmrghw v2, vs0, vs0 -; CHECK-P9-NEXT: lxv vs5, 32(r4) ; CHECK-P9-NEXT: xvcvuxwdp vs0, v2 ; CHECK-P9-NEXT: xxmrglw v2, vs2, vs2 ; CHECK-P9-NEXT: xvcvuxwdp vs3, v2 @@ -228,12 +228,12 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0 ; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs5, 32(r4) ; CHECK-BE-NEXT: lxv vs4, 48(r4) +; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0 ; CHECK-BE-NEXT: xvcvuxwdp vs1, v2 ; CHECK-BE-NEXT: xxmrglw v2, vs0, vs0 -; CHECK-BE-NEXT: lxv vs5, 32(r4) ; CHECK-BE-NEXT: xvcvuxwdp vs0, v2 ; CHECK-BE-NEXT: xxmrghw v2, vs2, vs2 ; CHECK-BE-NEXT: xvcvuxwdp vs3, v2 @@ -360,8 +360,8 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r4) -; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-P9-NEXT: lxv vs0, 16(r4) +; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-P9-NEXT: xvcvsxwdp vs2, v2 ; CHECK-P9-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-P9-NEXT: xvcvsxwdp vs1, v2 @@ -378,8 +378,8 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 0(r4) -; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-BE-NEXT: lxv vs0, 16(r4) +; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-BE-NEXT: xvcvsxwdp vs2, v2 ; CHECK-BE-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-BE-NEXT: xvcvsxwdp vs1, v2 @@ -450,12 +450,12 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0 ; CHECK-P9-NEXT: lxv vs2, 16(r4) +; CHECK-P9-NEXT: lxv vs5, 32(r4) ; CHECK-P9-NEXT: lxv vs4, 48(r4) +; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0 ; CHECK-P9-NEXT: xvcvsxwdp vs1, v2 ; CHECK-P9-NEXT: xxmrghw v2, vs0, vs0 -; CHECK-P9-NEXT: lxv vs5, 32(r4) ; CHECK-P9-NEXT: xvcvsxwdp vs0, v2 ; CHECK-P9-NEXT: xxmrglw v2, vs2, vs2 ; CHECK-P9-NEXT: xvcvsxwdp vs3, v2 @@ -482,12 +482,12 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0 ; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs5, 32(r4) ; CHECK-BE-NEXT: lxv vs4, 48(r4) +; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0 ; CHECK-BE-NEXT: xvcvsxwdp vs1, v2 ; CHECK-BE-NEXT: xxmrglw v2, vs0, vs0 -; CHECK-BE-NEXT: lxv vs5, 32(r4) ; CHECK-BE-NEXT: xvcvsxwdp vs0, v2 ; CHECK-BE-NEXT: xxmrghw v2, vs2, vs2 ; CHECK-BE-NEXT: xvcvsxwdp vs3, v2 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll @@ -74,8 +74,8 @@ ; CHECK-P9-LABEL: test4elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v3, 0(r3) -; CHECK-P9-NEXT: xvcvuxdsp vs0, v3 ; CHECK-P9-NEXT: lxv v2, 16(r3) +; CHECK-P9-NEXT: xvcvuxdsp vs0, v3 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v2 ; CHECK-P9-NEXT: xxsldwi v2, vs0, vs0, 3 @@ -85,8 +85,8 @@ ; CHECK-BE-LABEL: test4elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v3, 16(r3) -; CHECK-BE-NEXT: xvcvuxdsp vs0, v3 ; CHECK-BE-NEXT: lxv v2, 0(r3) +; CHECK-BE-NEXT: xvcvuxdsp vs0, v3 ; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v2 ; CHECK-BE-NEXT: xxsldwi v2, vs0, vs0, 3 @@ -129,14 +129,14 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v5, 0(r4) -; CHECK-P9-NEXT: xvcvuxdsp vs0, v5 ; CHECK-P9-NEXT: lxv v4, 16(r4) +; CHECK-P9-NEXT: lxv v3, 32(r4) +; CHECK-P9-NEXT: lxv v2, 48(r4) +; CHECK-P9-NEXT: xvcvuxdsp vs0, v5 ; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v4 -; CHECK-P9-NEXT: lxv v3, 32(r4) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v3 -; CHECK-P9-NEXT: lxv v2, 48(r4) ; CHECK-P9-NEXT: vpkudum v3, v4, v5 ; CHECK-P9-NEXT: stxv v3, 0(r3) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 @@ -149,14 +149,14 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v5, 16(r4) -; CHECK-BE-NEXT: xvcvuxdsp vs0, v5 ; CHECK-BE-NEXT: lxv v4, 0(r4) +; CHECK-BE-NEXT: lxv v3, 48(r4) +; CHECK-BE-NEXT: lxv v2, 32(r4) +; CHECK-BE-NEXT: xvcvuxdsp vs0, v5 ; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v4 -; CHECK-BE-NEXT: lxv v3, 48(r4) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v3 -; CHECK-BE-NEXT: lxv v2, 32(r4) ; CHECK-BE-NEXT: vpkudum v3, v4, v5 ; CHECK-BE-NEXT: stxv v3, 0(r3) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 @@ -227,30 +227,30 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v7, 0(r4) -; CHECK-P9-NEXT: xvcvuxdsp vs0, v7 ; CHECK-P9-NEXT: lxv v6, 16(r4) +; CHECK-P9-NEXT: lxv v1, 32(r4) +; CHECK-P9-NEXT: lxv v0, 48(r4) +; CHECK-P9-NEXT: xvcvuxdsp vs0, v7 +; CHECK-P9-NEXT: lxv v5, 64(r4) +; CHECK-P9-NEXT: lxv v4, 80(r4) +; CHECK-P9-NEXT: lxv v3, 96(r4) +; CHECK-P9-NEXT: lxv v2, 112(r4) ; CHECK-P9-NEXT: xxsldwi v7, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v6 -; CHECK-P9-NEXT: lxv v1, 32(r4) ; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v1 -; CHECK-P9-NEXT: lxv v0, 48(r4) ; CHECK-P9-NEXT: vpkudum v1, v6, v7 +; CHECK-P9-NEXT: stxv v1, 0(r3) ; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v0 -; CHECK-P9-NEXT: lxv v5, 64(r4) -; CHECK-P9-NEXT: stxv v1, 0(r3) ; CHECK-P9-NEXT: xxsldwi v0, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v5 -; CHECK-P9-NEXT: lxv v4, 80(r4) ; CHECK-P9-NEXT: vpkudum v0, v0, v6 ; CHECK-P9-NEXT: stxv v0, 16(r3) ; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v4 -; CHECK-P9-NEXT: lxv v3, 96(r4) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v3 -; CHECK-P9-NEXT: lxv v2, 112(r4) ; CHECK-P9-NEXT: vpkudum v4, v4, v5 ; CHECK-P9-NEXT: stxv v4, 32(r3) ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 @@ -263,30 +263,30 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v7, 16(r4) -; CHECK-BE-NEXT: xvcvuxdsp vs0, v7 ; CHECK-BE-NEXT: lxv v6, 0(r4) +; CHECK-BE-NEXT: lxv v1, 48(r4) +; CHECK-BE-NEXT: lxv v0, 32(r4) +; CHECK-BE-NEXT: xvcvuxdsp vs0, v7 +; CHECK-BE-NEXT: lxv v5, 80(r4) +; CHECK-BE-NEXT: lxv v4, 64(r4) +; CHECK-BE-NEXT: lxv v3, 112(r4) +; CHECK-BE-NEXT: lxv v2, 96(r4) ; CHECK-BE-NEXT: xxsldwi v7, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v6 -; CHECK-BE-NEXT: lxv v1, 48(r4) ; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v1 -; CHECK-BE-NEXT: lxv v0, 32(r4) ; CHECK-BE-NEXT: vpkudum v1, v6, v7 +; CHECK-BE-NEXT: stxv v1, 0(r3) ; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v0 -; CHECK-BE-NEXT: lxv v5, 80(r4) -; CHECK-BE-NEXT: stxv v1, 0(r3) ; CHECK-BE-NEXT: xxsldwi v0, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v5 -; CHECK-BE-NEXT: lxv v4, 64(r4) ; CHECK-BE-NEXT: vpkudum v0, v0, v6 ; CHECK-BE-NEXT: stxv v0, 16(r3) ; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v4 -; CHECK-BE-NEXT: lxv v3, 112(r4) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v3 -; CHECK-BE-NEXT: lxv v2, 96(r4) ; CHECK-BE-NEXT: vpkudum v4, v4, v5 ; CHECK-BE-NEXT: stxv v4, 32(r3) ; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3 @@ -367,8 +367,8 @@ ; CHECK-P9-LABEL: test4elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v3, 0(r3) -; CHECK-P9-NEXT: xvcvsxdsp vs0, v3 ; CHECK-P9-NEXT: lxv v2, 16(r3) +; CHECK-P9-NEXT: xvcvsxdsp vs0, v3 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v2 ; CHECK-P9-NEXT: xxsldwi v2, vs0, vs0, 3 @@ -378,8 +378,8 @@ ; CHECK-BE-LABEL: test4elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v3, 16(r3) -; CHECK-BE-NEXT: xvcvsxdsp vs0, v3 ; CHECK-BE-NEXT: lxv v2, 0(r3) +; CHECK-BE-NEXT: xvcvsxdsp vs0, v3 ; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v2 ; CHECK-BE-NEXT: xxsldwi v2, vs0, vs0, 3 @@ -422,14 +422,14 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v5, 0(r4) -; CHECK-P9-NEXT: xvcvsxdsp vs0, v5 ; CHECK-P9-NEXT: lxv v4, 16(r4) +; CHECK-P9-NEXT: lxv v3, 32(r4) +; CHECK-P9-NEXT: lxv v2, 48(r4) +; CHECK-P9-NEXT: xvcvsxdsp vs0, v5 ; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v4 -; CHECK-P9-NEXT: lxv v3, 32(r4) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v3 -; CHECK-P9-NEXT: lxv v2, 48(r4) ; CHECK-P9-NEXT: vpkudum v3, v4, v5 ; CHECK-P9-NEXT: stxv v3, 0(r3) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 @@ -442,14 +442,14 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v5, 16(r4) -; CHECK-BE-NEXT: xvcvsxdsp vs0, v5 ; CHECK-BE-NEXT: lxv v4, 0(r4) +; CHECK-BE-NEXT: lxv v3, 48(r4) +; CHECK-BE-NEXT: lxv v2, 32(r4) +; CHECK-BE-NEXT: xvcvsxdsp vs0, v5 ; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v4 -; CHECK-BE-NEXT: lxv v3, 48(r4) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v3 -; CHECK-BE-NEXT: lxv v2, 32(r4) ; CHECK-BE-NEXT: vpkudum v3, v4, v5 ; CHECK-BE-NEXT: stxv v3, 0(r3) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 @@ -520,30 +520,30 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v7, 0(r4) -; CHECK-P9-NEXT: xvcvsxdsp vs0, v7 ; CHECK-P9-NEXT: lxv v6, 16(r4) +; CHECK-P9-NEXT: lxv v1, 32(r4) +; CHECK-P9-NEXT: lxv v0, 48(r4) +; CHECK-P9-NEXT: xvcvsxdsp vs0, v7 +; CHECK-P9-NEXT: lxv v5, 64(r4) +; CHECK-P9-NEXT: lxv v4, 80(r4) +; CHECK-P9-NEXT: lxv v3, 96(r4) +; CHECK-P9-NEXT: lxv v2, 112(r4) ; CHECK-P9-NEXT: xxsldwi v7, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v6 -; CHECK-P9-NEXT: lxv v1, 32(r4) ; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v1 -; CHECK-P9-NEXT: lxv v0, 48(r4) ; CHECK-P9-NEXT: vpkudum v1, v6, v7 +; CHECK-P9-NEXT: stxv v1, 0(r3) ; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v0 -; CHECK-P9-NEXT: lxv v5, 64(r4) -; CHECK-P9-NEXT: stxv v1, 0(r3) ; CHECK-P9-NEXT: xxsldwi v0, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v5 -; CHECK-P9-NEXT: lxv v4, 80(r4) ; CHECK-P9-NEXT: vpkudum v0, v0, v6 ; CHECK-P9-NEXT: stxv v0, 16(r3) ; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v4 -; CHECK-P9-NEXT: lxv v3, 96(r4) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v3 -; CHECK-P9-NEXT: lxv v2, 112(r4) ; CHECK-P9-NEXT: vpkudum v4, v4, v5 ; CHECK-P9-NEXT: stxv v4, 32(r3) ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 @@ -556,30 +556,30 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v7, 16(r4) -; CHECK-BE-NEXT: xvcvsxdsp vs0, v7 ; CHECK-BE-NEXT: lxv v6, 0(r4) +; CHECK-BE-NEXT: lxv v1, 48(r4) +; CHECK-BE-NEXT: lxv v0, 32(r4) +; CHECK-BE-NEXT: xvcvsxdsp vs0, v7 +; CHECK-BE-NEXT: lxv v5, 80(r4) +; CHECK-BE-NEXT: lxv v4, 64(r4) +; CHECK-BE-NEXT: lxv v3, 112(r4) +; CHECK-BE-NEXT: lxv v2, 96(r4) ; CHECK-BE-NEXT: xxsldwi v7, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v6 -; CHECK-BE-NEXT: lxv v1, 48(r4) ; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v1 -; CHECK-BE-NEXT: lxv v0, 32(r4) ; CHECK-BE-NEXT: vpkudum v1, v6, v7 +; CHECK-BE-NEXT: stxv v1, 0(r3) ; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v0 -; CHECK-BE-NEXT: lxv v5, 80(r4) -; CHECK-BE-NEXT: stxv v1, 0(r3) ; CHECK-BE-NEXT: xxsldwi v0, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v5 -; CHECK-BE-NEXT: lxv v4, 64(r4) ; CHECK-BE-NEXT: vpkudum v0, v0, v6 ; CHECK-BE-NEXT: stxv v0, 16(r3) ; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v4 -; CHECK-BE-NEXT: lxv v3, 112(r4) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v3 -; CHECK-BE-NEXT: lxv v2, 96(r4) ; CHECK-BE-NEXT: vpkudum v4, v4, v5 ; CHECK-BE-NEXT: stxv v4, 32(r3) ; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll @@ -40,9 +40,9 @@ ; CHECK-P9-NEXT: mtfprwz f0, r3 ; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: xscvuxdsp f0, f0 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: vextubrx r3, r3, v2 ; CHECK-P9-NEXT: rlwinm r3, r3, 0, 24, 31 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 1 ; CHECK-P9-NEXT: mtfprwz f0, r3 ; CHECK-P9-NEXT: xscvuxdsp f0, f0 @@ -94,9 +94,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrws v2, r3 ; CHECK-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: vperm v2, v4, v2, v3 ; CHECK-P9-NEXT: xvcvuxwsp v2, v2 ; CHECK-P9-NEXT: blr @@ -105,9 +105,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r3 ; CHECK-BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r3, r3, .LCPI1_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r3 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-NEXT: xvcvuxwsp v2, v2 ; CHECK-BE-NEXT: blr @@ -142,10 +142,10 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 +; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -161,9 +161,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -217,9 +217,9 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -246,9 +246,9 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -308,9 +308,9 @@ ; CHECK-P9-NEXT: mtfprwa f0, r3 ; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: xscvsxdsp f0, f0 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: vextubrx r3, r3, v2 ; CHECK-P9-NEXT: extsb r3, r3 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 1 ; CHECK-P9-NEXT: mtfprwa f0, r3 ; CHECK-P9-NEXT: xscvsxdsp f0, f0 @@ -419,11 +419,11 @@ ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI6_0@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI6_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r4 ; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: vperm v3, v2, v2, v3 +; CHECK-P9-NEXT: lxvx v3, 0, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI6_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI6_1@toc@l +; CHECK-P9-NEXT: vperm v3, v2, v2, v3 ; CHECK-P9-NEXT: vextsb2w v3, v3 ; CHECK-P9-NEXT: xvcvsxwsp vs0, v3 ; CHECK-P9-NEXT: lxvx v3, 0, r4 @@ -438,13 +438,13 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; CHECK-BE-NEXT: xxlxor v3, v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v3, v3, v3 -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha -; CHECK-BE-NEXT: vextsb2w v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vextsb2w v3, v3 ; CHECK-BE-NEXT: xvcvsxwsp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: vperm v2, v2, v2, v3 @@ -537,12 +537,12 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l +; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: vextsb2w v3, v3 ; CHECK-BE-NEXT: xvcvsxwsp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll @@ -26,9 +26,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrws v2, r3 ; CHECK-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: vperm v2, v4, v2, v3 ; CHECK-P9-NEXT: xvcvuxddp v2, v2 ; CHECK-P9-NEXT: blr @@ -37,9 +37,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r3 ; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r3 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-NEXT: xvcvuxddp v2, v2 ; CHECK-BE-NEXT: blr @@ -76,9 +76,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrws v2, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI1_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI1_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -94,9 +94,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI1_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI1_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -157,10 +157,10 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 +; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -188,9 +188,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -292,9 +292,9 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -345,9 +345,9 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -498,13 +498,13 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha +; CHECK-BE-NEXT: xxlxor v3, v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v3, v3, v3 -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha -; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l +; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: vperm v2, v2, v2, v3 @@ -575,11 +575,11 @@ ; CHECK-P9-NEXT: mtvsrd f0, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI6_0@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI6_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r4 ; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: vperm v3, v2, v2, v3 +; CHECK-P9-NEXT: lxvx v3, 0, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI6_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI6_1@toc@l +; CHECK-P9-NEXT: vperm v3, v2, v2, v3 ; CHECK-P9-NEXT: vextsb2d v3, v3 ; CHECK-P9-NEXT: xvcvsxddp vs0, v3 ; CHECK-P9-NEXT: lxvx v3, 0, r4 @@ -608,13 +608,13 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha -; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_2@toc@ha @@ -795,12 +795,12 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha +; CHECK-BE-NEXT: xxlxor v3, v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v3, v3, v3 -; CHECK-BE-NEXT: vperm v4, v3, v2, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l +; CHECK-BE-NEXT: vperm v4, v3, v2, v4 ; CHECK-BE-NEXT: vextsb2d v4, v4 ; CHECK-BE-NEXT: xvcvsxddp vs0, v4 ; CHECK-BE-NEXT: lxvx v4, 0, r4 diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll @@ -313,8 +313,8 @@ ; PC64LE9-NEXT: addis 3, 2, .LCPI6_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI6_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI6_2@toc@l(3) ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -398,24 +398,24 @@ ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI7_2@toc@ha ; PC64LE9-NEXT: fmr 30, 1 -; PC64LE9-NEXT: lfs 1, .LCPI7_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI7_2@toc@l(3) ; PC64LE9-NEXT: bl fmodf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI7_3@toc@ha ; PC64LE9-NEXT: fmr 29, 1 -; PC64LE9-NEXT: lfs 1, .LCPI7_3@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI7_3@toc@l(3) ; PC64LE9-NEXT: bl fmodf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI7_4@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI7_4@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 29 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 -; PC64LE9-NEXT: addis 3, 2, .LCPI7_4@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI7_4@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -486,27 +486,27 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -80(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI8_0@toc@ha +; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI8_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI8_1@toc@ha -; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill ; PC64LE9-NEXT: lfs 31, .LCPI8_1@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI8_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI8_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI8_2@toc@l(3) ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI8_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfs 1, .LCPI8_3@toc@l(3) -; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: fmr 3, 1 @@ -588,34 +588,34 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -80(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI9_0@toc@ha +; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI9_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI9_1@toc@ha -; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill ; PC64LE9-NEXT: lfs 31, .LCPI9_1@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI9_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI9_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI9_2@toc@l(3) ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI9_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfs 1, .LCPI9_3@toc@l(3) -; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI9_4@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI9_4@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI9_4@toc@l(3) ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -951,23 +951,23 @@ ; PC64LE9-LABEL: constrained_vector_fadd_v3f32: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: addis 3, 2, .LCPI17_0@toc@ha +; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: lfs 0, .LCPI17_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI17_1@toc@ha ; PC64LE9-NEXT: lfs 2, .LCPI17_1@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI17_2@toc@ha -; PC64LE9-NEXT: xsaddsp 2, 0, 2 ; PC64LE9-NEXT: lfs 3, .LCPI17_2@toc@l(3) -; PC64LE9-NEXT: xxlxor 1, 1, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI17_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI17_3@toc@l ; PC64LE9-NEXT: xsaddsp 1, 0, 1 +; PC64LE9-NEXT: lxvx 36, 0, 3 +; PC64LE9-NEXT: xsaddsp 2, 0, 2 ; PC64LE9-NEXT: xsaddsp 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 0 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 1 -; PC64LE9-NEXT: addis 3, 2, .LCPI17_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI17_3@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -1006,9 +1006,9 @@ ; PC64LE9-LABEL: constrained_vector_fadd_v3f64: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: addis 3, 2, .LCPI18_0@toc@ha +; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: lfd 0, .LCPI18_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI18_1@toc@ha -; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: addi 3, 3, .LCPI18_1@toc@l ; PC64LE9-NEXT: xsadddp 3, 0, 1 ; PC64LE9-NEXT: lxvx 0, 0, 3 @@ -1167,23 +1167,23 @@ ; PC64LE9-LABEL: constrained_vector_fsub_v3f32: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: addis 3, 2, .LCPI22_0@toc@ha +; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: lfs 0, .LCPI22_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI22_1@toc@ha ; PC64LE9-NEXT: lfs 2, .LCPI22_1@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI22_2@toc@ha -; PC64LE9-NEXT: xssubsp 2, 0, 2 ; PC64LE9-NEXT: lfs 3, .LCPI22_2@toc@l(3) -; PC64LE9-NEXT: xxlxor 1, 1, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI22_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI22_3@toc@l ; PC64LE9-NEXT: xssubsp 1, 0, 1 +; PC64LE9-NEXT: lxvx 36, 0, 3 +; PC64LE9-NEXT: xssubsp 2, 0, 2 ; PC64LE9-NEXT: xssubsp 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 0 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 1 -; PC64LE9-NEXT: addis 3, 2, .LCPI22_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI22_3@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -1222,9 +1222,9 @@ ; PC64LE9-LABEL: constrained_vector_fsub_v3f64: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: addis 3, 2, .LCPI23_0@toc@ha +; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: lfd 0, .LCPI23_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI23_1@toc@ha -; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: addi 3, 3, .LCPI23_1@toc@l ; PC64LE9-NEXT: xssubdp 3, 0, 1 ; PC64LE9-NEXT: lxvx 0, 0, 3 @@ -1562,8 +1562,8 @@ ; PC64LE9-NEXT: addis 3, 2, .LCPI31_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI31_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfd 1, .LCPI31_2@toc@l(3) ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -1647,24 +1647,24 @@ ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI32_2@toc@ha ; PC64LE9-NEXT: fmr 30, 1 -; PC64LE9-NEXT: lfs 1, .LCPI32_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI32_2@toc@l(3) ; PC64LE9-NEXT: bl powf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI32_3@toc@ha ; PC64LE9-NEXT: fmr 29, 1 -; PC64LE9-NEXT: lfs 1, .LCPI32_3@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI32_3@toc@l(3) ; PC64LE9-NEXT: bl powf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI32_4@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI32_4@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 29 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 -; PC64LE9-NEXT: addis 3, 2, .LCPI32_4@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI32_4@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -1735,27 +1735,27 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -80(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI33_0@toc@ha +; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI33_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI33_1@toc@ha -; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill ; PC64LE9-NEXT: lfs 31, .LCPI33_1@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI33_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI33_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfd 1, .LCPI33_2@toc@l(3) ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI33_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfd 1, .LCPI33_3@toc@l(3) -; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: fmr 3, 1 @@ -1837,34 +1837,34 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -80(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI34_0@toc@ha +; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfd 1, .LCPI34_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI34_1@toc@ha -; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill ; PC64LE9-NEXT: lfs 31, .LCPI34_1@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI34_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI34_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfd 1, .LCPI34_2@toc@l(3) ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI34_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfd 1, .LCPI34_3@toc@l(3) -; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI34_4@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI34_4@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfd 1, .LCPI34_4@toc@l(3) ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -1912,8 +1912,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -32(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI35_0@toc@ha -; PC64LE9-NEXT: lfs 1, .LCPI35_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI35_0@toc@l(3) ; PC64LE9-NEXT: bl __powisf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 @@ -1965,15 +1965,15 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -48(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI36_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI36_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfd 1, .LCPI36_0@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI36_1@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI36_1@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfd 1, .LCPI36_1@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -2045,30 +2045,30 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -48(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI37_0@toc@ha -; PC64LE9-NEXT: lfs 1, .LCPI37_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI37_0@toc@l(3) ; PC64LE9-NEXT: bl __powisf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI37_1@toc@ha ; PC64LE9-NEXT: fmr 31, 1 -; PC64LE9-NEXT: lfs 1, .LCPI37_1@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI37_1@toc@l(3) ; PC64LE9-NEXT: bl __powisf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI37_2@toc@ha ; PC64LE9-NEXT: fmr 30, 1 -; PC64LE9-NEXT: lfs 1, .LCPI37_2@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI37_2@toc@l(3) ; PC64LE9-NEXT: bl __powisf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI37_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI37_3@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI37_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI37_3@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -2134,24 +2134,24 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI38_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI38_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI38_0@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI38_1@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI38_1@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI38_1@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI38_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: xxmrghd 63, 0, 1 ; PC64LE9-NEXT: lfd 1, .LCPI38_2@toc@l(3) -; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: fmr 3, 1 @@ -2228,31 +2228,31 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI39_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI39_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI39_0@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI39_1@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI39_1@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfd 1, .LCPI39_1@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI39_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfd 1, .LCPI39_2@toc@l(3) -; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI39_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI39_3@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfd 1, .LCPI39_3@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -2432,12 +2432,12 @@ ; PC64LE9-NEXT: bl sinf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI42_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI42_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI42_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI42_3@toc@l ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 1 @@ -2500,8 +2500,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI43_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI43_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI43_0@toc@l(3) ; PC64LE9-NEXT: bl sin ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI43_1@toc@ha @@ -2586,8 +2586,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI44_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI44_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI44_0@toc@l(3) ; PC64LE9-NEXT: bl sin ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI44_1@toc@ha @@ -2785,12 +2785,12 @@ ; PC64LE9-NEXT: bl cosf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI47_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI47_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI47_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI47_3@toc@l ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 1 @@ -2853,8 +2853,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI48_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI48_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI48_0@toc@l(3) ; PC64LE9-NEXT: bl cos ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI48_1@toc@ha @@ -2939,8 +2939,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI49_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI49_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI49_0@toc@l(3) ; PC64LE9-NEXT: bl cos ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI49_1@toc@ha @@ -3138,12 +3138,12 @@ ; PC64LE9-NEXT: bl expf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI52_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI52_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI52_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI52_3@toc@l ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 1 @@ -3206,8 +3206,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI53_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI53_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI53_0@toc@l(3) ; PC64LE9-NEXT: bl exp ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI53_1@toc@ha @@ -3292,8 +3292,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI54_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI54_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI54_0@toc@l(3) ; PC64LE9-NEXT: bl exp ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI54_1@toc@ha @@ -3491,12 +3491,12 @@ ; PC64LE9-NEXT: bl exp2f ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI57_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI57_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI57_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI57_3@toc@l ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 1 @@ -3559,8 +3559,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI58_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI58_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI58_0@toc@l(3) ; PC64LE9-NEXT: bl exp2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI58_1@toc@ha @@ -3645,8 +3645,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI59_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI59_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI59_0@toc@l(3) ; PC64LE9-NEXT: bl exp2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI59_1@toc@ha @@ -3844,12 +3844,12 @@ ; PC64LE9-NEXT: bl logf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI62_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI62_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI62_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI62_3@toc@l ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 1 @@ -3912,8 +3912,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI63_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI63_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI63_0@toc@l(3) ; PC64LE9-NEXT: bl log ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI63_1@toc@ha @@ -3998,8 +3998,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI64_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI64_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI64_0@toc@l(3) ; PC64LE9-NEXT: bl log ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI64_1@toc@ha @@ -4197,12 +4197,12 @@ ; PC64LE9-NEXT: bl log10f ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI67_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI67_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI67_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI67_3@toc@l ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 1 @@ -4265,8 +4265,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI68_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI68_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI68_0@toc@l(3) ; PC64LE9-NEXT: bl log10 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI68_1@toc@ha @@ -4351,8 +4351,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI69_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI69_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI69_0@toc@l(3) ; PC64LE9-NEXT: bl log10 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI69_1@toc@ha @@ -4550,12 +4550,12 @@ ; PC64LE9-NEXT: bl log2f ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI72_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI72_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI72_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI72_3@toc@l ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 1 @@ -4618,8 +4618,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI73_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI73_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI73_0@toc@l(3) ; PC64LE9-NEXT: bl log2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI73_1@toc@ha @@ -4704,8 +4704,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI74_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI74_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI74_0@toc@l(3) ; PC64LE9-NEXT: bl log2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI74_1@toc@ha @@ -4903,12 +4903,12 @@ ; PC64LE9-NEXT: bl rintf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI77_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI77_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI77_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI77_3@toc@l ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 1 @@ -4971,8 +4971,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI78_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI78_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI78_0@toc@l(3) ; PC64LE9-NEXT: bl rint ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI78_1@toc@ha @@ -5057,8 +5057,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI79_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI79_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI79_0@toc@l(3) ; PC64LE9-NEXT: bl rint ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI79_1@toc@ha @@ -5256,12 +5256,12 @@ ; PC64LE9-NEXT: bl nearbyintf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI82_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI82_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI82_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI82_3@toc@l ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 1 @@ -5324,8 +5324,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI83_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI83_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI83_0@toc@l(3) ; PC64LE9-NEXT: bl nearbyint ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI83_1@toc@ha @@ -5410,8 +5410,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI84_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI84_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI84_0@toc@l(3) ; PC64LE9-NEXT: bl nearbyint ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI84_1@toc@ha @@ -5629,19 +5629,19 @@ ; PC64LE9-NEXT: bl fmaxf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI87_4@toc@ha -; PC64LE9-NEXT: lfs 2, .LCPI87_4@toc@l(3) ; PC64LE9-NEXT: fmr 29, 1 ; PC64LE9-NEXT: fmr 1, 31 +; PC64LE9-NEXT: lfs 2, .LCPI87_4@toc@l(3) ; PC64LE9-NEXT: bl fmaxf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI87_5@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI87_5@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 29 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 -; PC64LE9-NEXT: addis 3, 2, .LCPI87_5@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI87_5@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -5710,10 +5710,10 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI88_0@toc@ha +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI88_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI88_1@toc@ha ; PC64LE9-NEXT: lfs 2, .LCPI88_1@toc@l(3) -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl fmax ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI88_2@toc@ha @@ -5810,10 +5810,10 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI89_0@toc@ha +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI89_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI89_1@toc@ha ; PC64LE9-NEXT: lfs 2, .LCPI89_1@toc@l(3) -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl fmax ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI89_2@toc@ha @@ -6038,19 +6038,19 @@ ; PC64LE9-NEXT: bl fminf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI92_4@toc@ha -; PC64LE9-NEXT: lfs 2, .LCPI92_4@toc@l(3) ; PC64LE9-NEXT: fmr 29, 1 ; PC64LE9-NEXT: fmr 1, 31 +; PC64LE9-NEXT: lfs 2, .LCPI92_4@toc@l(3) ; PC64LE9-NEXT: bl fminf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI92_5@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI92_5@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 29 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: xscvdpspn 0, 30 -; PC64LE9-NEXT: addis 3, 2, .LCPI92_5@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI92_5@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrglw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 1 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -6119,10 +6119,10 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI93_0@toc@ha +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI93_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI93_1@toc@ha ; PC64LE9-NEXT: lfs 2, .LCPI93_1@toc@l(3) -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl fmin ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI93_2@toc@ha @@ -6219,10 +6219,10 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI94_0@toc@ha +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI94_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI94_1@toc@ha ; PC64LE9-NEXT: lfs 2, .LCPI94_1@toc@l(3) -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl fmin ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI94_2@toc@ha diff --git a/llvm/test/CodeGen/X86/testb-je-fusion.ll b/llvm/test/CodeGen/X86/testb-je-fusion.ll --- a/llvm/test/CodeGen/X86/testb-je-fusion.ll +++ b/llvm/test/CodeGen/X86/testb-je-fusion.ll @@ -238,8 +238,8 @@ ; NOFUSION_MISCHEDPOSTRA-LABEL: macrofuse_alu_je: ; NOFUSION_MISCHEDPOSTRA: # %bb.0: # %entry ; NOFUSION_MISCHEDPOSTRA-NEXT: movl %edi, %eax -; NOFUSION_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00 ; NOFUSION_MISCHEDPOSTRA-NEXT: movb $1, (%rsi) +; NOFUSION_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00 ; NOFUSION_MISCHEDPOSTRA-NEXT: je .LBB2_2 ; NOFUSION_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then ; NOFUSION_MISCHEDPOSTRA-NEXT: movl $1, %eax @@ -249,8 +249,8 @@ ; BRANCHFUSIONONLY_MISCHEDPOSTRA-LABEL: macrofuse_alu_je: ; BRANCHFUSIONONLY_MISCHEDPOSTRA: # %bb.0: # %entry ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl %edi, %eax -; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00 ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movb $1, (%rsi) +; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00 ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: je .LBB2_2 ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl $1, %eax @@ -340,8 +340,8 @@ ; NOFUSION_MISCHEDPOSTRA-LABEL: macrofuse_dec_je: ; NOFUSION_MISCHEDPOSTRA: # %bb.0: # %entry ; NOFUSION_MISCHEDPOSTRA-NEXT: movl %edi, %eax -; NOFUSION_MISCHEDPOSTRA-NEXT: decl %eax ; NOFUSION_MISCHEDPOSTRA-NEXT: movb $1, (%rsi) +; NOFUSION_MISCHEDPOSTRA-NEXT: decl %eax ; NOFUSION_MISCHEDPOSTRA-NEXT: je .LBB3_2 ; NOFUSION_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then ; NOFUSION_MISCHEDPOSTRA-NEXT: movl $1, %eax @@ -351,8 +351,8 @@ ; BRANCHFUSIONONLY_MISCHEDPOSTRA-LABEL: macrofuse_dec_je: ; BRANCHFUSIONONLY_MISCHEDPOSTRA: # %bb.0: # %entry ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl %edi, %eax -; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: decl %eax ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movb $1, (%rsi) +; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: decl %eax ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: je .LBB3_2 ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl $1, %eax diff --git a/llvm/test/CodeGen/X86/topdepthreduce-postra.mir b/llvm/test/CodeGen/X86/topdepthreduce-postra.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/topdepthreduce-postra.mir @@ -0,0 +1,16 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=x86_64 -enable-post-misched -run-pass=postmisched -o - %s | FileCheck %s +--- +# Check that postmisched's TopDepthReduce heuristic moves the DEC32r later +# because of the dependency on eax +name: test +body: | + bb.0: + ; CHECK-LABEL: name: test + ; CHECK: $eax = MOV32rr killed $edi + ; CHECK: MOV8mi killed renamable $rsi, 1, $noreg, 0, $noreg, 1 :: (store 1) + ; CHECK: renamable $eax = DEC32r killed renamable $eax, implicit-def $eflags + $eax = MOV32rr $edi + renamable $eax = DEC32r killed renamable $eax, implicit-def $eflags + MOV8mi killed renamable $rsi, 1, $noreg, 0, $noreg, 1 :: (store 1) +...