diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -92,6 +92,8 @@ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFSHR(SDValue Op, SelectionDAG &DAG) const; + protected: bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -408,7 +408,7 @@ } // The hardware supports 32-bit FSHR, but not FSHL. - setOperationAction(ISD::FSHR, MVT::i32, Legal); + setOperationAction(ISD::FSHR, MVT::i32, Custom); // The hardware supports 32-bit ROTR, but not ROTL. setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand); @@ -1330,6 +1330,8 @@ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_CTTZ(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::FSHR: + return LowerFSHR(Op, DAG); } return Op; } @@ -3414,6 +3416,12 @@ return DAG.getBuildVector(VT, DL, Args); } +SDValue AMDGPUTargetLowering::LowerFSHR(SDValue Op, SelectionDAG &DAG) const { + // There is no SALU instruction corresponding to fshr, so keep the node only + // when it is divergent (VALU). Returning SDValue() expands a uniform fshr. + return Op->isDivergent() ? 
Op : SDValue(); +} + //===----------------------------------------------------------------------===// // Custom DAG optimizations //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -625,9 +625,10 @@ ; GCN-LABEL: test_arg_store_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 @@ -638,9 +639,10 @@ ; GFX7-LABEL: test_arg_store_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -675,11 +677,12 @@ ; GCN-LABEL: test_arg_store_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 @@ -690,9 +693,10 @@ ; GFX7-LABEL: test_arg_store_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 @@ -734,11 +738,13 @@ ; GCN-LABEL: test_arg_store_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v6 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 @@ -749,11 +755,13 @@ ; GFX7-LABEL: test_arg_store_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -788,37 +796,45 @@ ; GCN-LABEL: test_arg_store_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v11, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v12 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v10, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_arg_store_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; 
GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -850,54 +866,70 @@ ; GCN-LABEL: test_arg_store_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v18, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v19, v18 +; GCN-NEXT: v_or_b32_e32 v0, v0, v20 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 ; 
GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v19, v12, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_arg_store_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v14, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v12, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GFX7-NEXT: v_or_b32_e32 v14, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GFX7-NEXT: v_or_b32_e32 v13, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; GFX7-NEXT: v_or_b32_e32 v12, v1, v0 +; GFX7-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v12, v0, v10, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v11, v1, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v8, 16 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1707,10 +1739,11 @@ ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: buffer_store_short v2, v5, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen @@ -1742,8 +1775,9 @@ ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll --- 
a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -162,8 +162,8 @@ ; GFX8-LABEL: undef_lo2_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] ; GFX8-NEXT: ;;#ASMEND @@ -187,8 +187,8 @@ ; GFX8-LABEL: undef_lo2_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] ; GFX8-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -71,7 +71,10 @@ ; R600-NOT: MOV ; GFX6: s_mov_b32 s3, 0xf000 ; GFX6: s_waitcnt lgkmcnt(0) -; GFX6: v_alignbit_b32 v0, 5, s4, 16 +; GFX6: s_lshr_b32 s2, s2, 16 +; GFX6: s_or_b32 s4, s2, 0x50000 +; GFX6: s_mov_b32 s2, -1 +; GFX6: v_mov_b32_e32 v0, s4 ; GFX6: buffer_store_dword v0, off, s[0:3], 0 ; GFX8: s_mov_b32 s3, 0xf000 ; GFX8: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -450,13 +450,14 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: 
s_lshr_b32 s2, s2, 16 +; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1233,10 +1233,12 @@ ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -239,39 +239,41 @@ ; ; EG-LABEL: fp_to_sint_i64: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 40, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 42, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T0.W, KC0[2].Z, literal.x, PV.W, -; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.y, -; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) -; EG-NEXT: OR_INT T1.W, PS, 
literal.x, -; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) +; EG-NEXT: BFE_UINT * T0.W, KC0[2].Z, literal.x, PV.W, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T2.W, KC0[2].Z, literal.y, +; EG-NEXT: -150(nan), 8388607(1.175494e-38) +; EG-NEXT: OR_INT T0.Z, PS, literal.x, +; EG-NEXT: NOT_INT T2.W, PV.W, +; EG-NEXT: SUB_INT * T3.W, literal.y, T0.W, +; EG-NEXT: 8388608(1.175494e-38), 150(2.101948e-43) ; EG-NEXT: ADD_INT T0.X, T0.W, literal.x, -; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W, -; EG-NEXT: AND_INT T0.Z, PS, literal.z, -; EG-NEXT: NOT_INT T0.W, PS, -; EG-NEXT: LSHR * T3.W, PV.W, 1, -; EG-NEXT: -127(nan), 150(2.101948e-43) -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T1.Y, T1.W, PV.Z, -; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122 -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT T0.Y, T1.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T1.Z, PS, literal.y, +; EG-NEXT: AND_INT T0.W, PV.W, literal.y, +; EG-NEXT: LSHR * T2.W, PV.Z, 1, +; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: LSHR T1.X, PS, PV.W, +; EG-NEXT: LSHR T1.Y, T0.Z, PV.Z, +; EG-NEXT: AND_INT T1.Z, T3.W, literal.x, +; EG-NEXT: LSHL T0.W, T0.Z, PV.Y, +; EG-NEXT: AND_INT * T1.W, T1.W, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0, -; EG-NEXT: CNDE_INT T0.W, PV.Z, PV.X, PV.Y, +; EG-NEXT: CNDE_INT T0.Z, PV.Z, PV.Y, 0.0, +; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W, ; EG-NEXT: SETGT_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.Z, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T0.W, PS, PV.Y, PV.Z, +; EG-NEXT: CNDE_INT T1.Z, PS, 0.0, PV.W, +; EG-NEXT: CNDE_INT T0.W, PS, PV.Z, PV.Y, ; EG-NEXT: ASHR * 
T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T0.W, PV.W, PS, @@ -364,7 +366,7 @@ ; ; EG-LABEL: fp_to_sint_v2i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 80, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -374,72 +376,77 @@ ; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W, -; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z, -; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44) +; EG-NEXT: SUB_INT T2.W, literal.y, PV.W, +; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z, +; EG-NEXT: 8388607(1.175494e-38), 150(2.101948e-43) ; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: SUB_INT T0.X, literal.x, PV.W, -; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W, -; EG-NEXT: AND_INT T1.Z, PS, literal.y, -; EG-NEXT: OR_INT T3.W, PV.Z, literal.z, -; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w, -; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44) -; EG-NEXT: 8388608(1.175494e-38), 8388607(1.175494e-38) -; EG-NEXT: OR_INT T1.X, PS, literal.x, -; EG-NEXT: LSHL T1.Y, PV.W, PV.Z, +; EG-NEXT: AND_INT T0.X, KC0[3].X, literal.x, +; EG-NEXT: AND_INT T0.Y, PS, literal.y, +; EG-NEXT: AND_INT T1.Z, PV.W, literal.y, +; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.z, T0.W, +; EG-NEXT: OR_INT * T4.W, PV.Z, literal.w, +; EG-NEXT: 8388607(1.175494e-38), 31(4.344025e-44) +; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38) +; EG-NEXT: ADD_INT T1.X, PV.W, literal.x, +; EG-NEXT: LSHR T1.Y, PS, PV.Z, ; EG-NEXT: AND_INT T0.Z, T2.W, literal.y, -; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.W, PV.Y, -; EG-NEXT: AND_INT * T5.W, PV.Y, literal.y, -; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0, -; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x, -; EG-NEXT: 
BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X, -; EG-NEXT: AND_INT * T5.W, T0.X, literal.y, +; EG-NEXT: LSHL T2.W, PS, PV.Y, +; EG-NEXT: AND_INT * T5.W, T3.W, literal.y, ; EG-NEXT: -150(nan), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0, -; EG-NEXT: NOT_INT T2.Y, T2.W, -; EG-NEXT: AND_INT T2.Z, PV.Z, literal.x, -; EG-NEXT: NOT_INT T2.W, PV.Z, -; EG-NEXT: LSHR * T4.W, T1.X, 1, +; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T0.Z, PV.Z, PV.Y, 0.0, +; EG-NEXT: AND_INT T6.W, PV.X, literal.x, +; EG-NEXT: OR_INT * T7.W, T0.X, literal.y, +; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38) +; EG-NEXT: NOT_INT T0.X, T1.X, +; EG-NEXT: SUB_INT T1.Y, literal.x, T0.W, +; EG-NEXT: NOT_INT T1.Z, T3.W, +; EG-NEXT: LSHL T3.W, PS, PV.W, +; EG-NEXT: AND_INT * T6.W, T1.X, literal.y, +; EG-NEXT: 150(2.101948e-43), 32(4.484155e-44) +; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, +; EG-NEXT: AND_INT T2.Y, PV.Z, literal.x, +; EG-NEXT: AND_INT T1.Z, PV.Y, literal.x, +; EG-NEXT: AND_INT T8.W, PV.X, literal.x, +; EG-NEXT: LSHR * T9.W, T7.W, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T3.X, T3.W, 1, +; EG-NEXT: LSHR T0.X, T4.W, 1, ; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W, -; EG-NEXT: LSHL T0.W, T1.X, PV.Z, -; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y, +; EG-NEXT: LSHR T2.Z, PS, PV.W, +; EG-NEXT: LSHR T0.W, T7.W, PV.Z, BS:VEC_201 +; EG-NEXT: AND_INT * T4.W, T1.Y, literal.y, ; EG-NEXT: -127(nan), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W, +; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Y, T6.W, PV.Z, T3.W, ; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y, -; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y, +; EG-NEXT: ADD_INT T0.W, T1.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR * T1.W, PV.X, T2.Y, ; EG-NEXT: 23(3.222986e-44), -127(nan) -; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y, 
-; EG-NEXT: SETGT_INT T1.Y, PS, literal.x, -; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y, -; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X, +; EG-NEXT: CNDE_INT T0.X, T5.W, PS, T2.W, +; EG-NEXT: SETGT_INT T2.Y, PV.W, literal.x, +; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X, ; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y, ; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44) -; EG-NEXT: XOR_INT T0.X, PV.W, PS, -; EG-NEXT: XOR_INT T2.Y, PV.Z, PS, -; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X, -; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y, +; EG-NEXT: XOR_INT T1.X, PV.W, PS, +; EG-NEXT: XOR_INT T1.Y, PV.Z, PS, +; EG-NEXT: CNDE_INT T1.Z, PV.Y, 0.0, PV.X, +; EG-NEXT: CNDE_INT T1.W, PV.Y, T0.Z, T0.Y, ; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T0.Y, PV.W, PS, ; EG-NEXT: XOR_INT T0.Z, PV.Z, PS, -; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W, +; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W, ; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W, ; EG-NEXT: SUB_INT T1.Y, PV.W, PS, ; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y, -; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W, +; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W, ; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W, ; EG-NEXT: SUB_INT T0.Z, PV.W, PS, -; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W, +; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W, ; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0, ; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0, -; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W, +; EG-NEXT: SUB_INT * T2.W, T1.X, T2.W, ; EG-NEXT: CNDE_INT T1.Z, T1.Z, PV.W, 0.0, ; EG-NEXT: SUB_INT * T2.W, T0.Y, T3.W, ; EG-NEXT: CNDE_INT T1.X, T0.W, PV.W, 0.0, @@ -567,170 +574,181 @@ ; ; EG-LABEL: fp_to_sint_v4i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1 +; EG-NEXT: ALU 100, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 66, @107, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T4.XYZW, T1.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 6: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W, -; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y, -; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) -; EG-NEXT: OR_INT T0.Z, PS, literal.x, -; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W, -; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z, -; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44) -; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x, -; EG-NEXT: AND_INT T1.Z, PS, literal.y, -; EG-NEXT: NOT_INT T4.W, PS, -; EG-NEXT: LSHR * T5.W, PV.Z, 1, -; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: BFE_UINT * T1.W, KC0[3].Z, literal.x, PV.W, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T2.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T3.W, KC0[3].Z, literal.y, +; EG-NEXT: -150(nan), 8388607(1.175494e-38) +; EG-NEXT: OR_INT T3.W, PS, literal.x, +; EG-NEXT: NOT_INT * T4.W, PV.W, +; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, +; EG-NEXT: AND_INT T4.W, PS, literal.x, +; EG-NEXT: LSHR * T5.W, PV.W, 1, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.X, T1.W, literal.x, -; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W, -; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201 -; EG-NEXT: LSHL T3.W, T0.Z, PV.Z, -; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W, +; EG-NEXT: LSHR T0.Y, PS, PV.W, +; EG-NEXT: AND_INT T1.Z, T2.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: BFE_UINT * T2.W, KC0[4].X, literal.z, T0.W, ; EG-NEXT: -127(nan), 32(4.484155e-44) -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.X, PS, literal.x, -; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS, -; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y, -; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W, -; EG-NEXT: SETGT_INT * 
T4.W, PV.X, literal.z, -; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T2.X, PS, 0.0, PV.W, -; EG-NEXT: OR_INT T1.Y, PV.Z, literal.x, +; EG-NEXT: LSHL * T4.W, T3.W, T0.Z, +; EG-NEXT: AND_INT T1.Y, KC0[4].X, literal.x, ; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y, -; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0, -; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) -; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS, -; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x, -; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x, -; EG-NEXT: NOT_INT T1.W, PV.Z, -; EG-NEXT: LSHR * T3.W, PV.Y, 1, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z, -; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y, -; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y, -; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W, -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.X, T0.Z, literal.x, -; EG-NEXT: AND_INT T4.Y, PS, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y, -; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y, +; EG-NEXT: CNDE_INT T5.W, T1.Z, T0.Y, PV.W, +; EG-NEXT: SETGT_INT * T6.W, T0.X, literal.z, +; EG-NEXT: 8388607(1.175494e-38), -150(nan) +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x, +; EG-NEXT: CNDE_INT T0.Y, PS, 0.0, PV.W, +; EG-NEXT: AND_INT T2.Z, PV.Z, literal.y, +; EG-NEXT: OR_INT T5.W, PV.Y, literal.z, +; EG-NEXT: SUB_INT * T1.W, literal.w, T1.W, +; EG-NEXT: 8388607(1.175494e-38), 31(4.344025e-44) +; EG-NEXT: 8388608(1.175494e-38), 150(2.101948e-43) +; EG-NEXT: NOT_INT T2.X, T0.Z, +; EG-NEXT: SUB_INT T1.Y, literal.x, T2.W, +; EG-NEXT: AND_INT T3.Z, PS, literal.y, +; EG-NEXT: LSHL T7.W, PV.W, PV.Z, +; EG-NEXT: AND_INT * T8.W, T0.Z, literal.z, +; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: 
SUB_INT T2.X, PV.W, PS, -; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0, -; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0, -; EG-NEXT: CNDE_INT T1.W, PV.X, T3.X, T3.Y, BS:VEC_021/SCL_122 -; EG-NEXT: SETGT_INT * T2.W, T0.Y, literal.x, +; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0, +; EG-NEXT: LSHR T2.Y, T3.W, PV.Z, +; EG-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; EG-NEXT: AND_INT T3.W, PV.X, literal.x, +; EG-NEXT: LSHR * T9.W, T5.W, 1, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.X, T1.W, literal.x, +; EG-NEXT: ADD_INT T3.Y, T2.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T2.Z, PS, PV.W, +; EG-NEXT: LSHR T1.W, T5.W, PV.Z, BS:VEC_201 +; EG-NEXT: AND_INT * T2.W, T1.Y, literal.x, +; EG-NEXT: 32(4.484155e-44), -127(nan) +; EG-NEXT: CNDE_INT T4.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Y, T8.W, PV.Z, T7.W, +; EG-NEXT: SETGT_INT T0.Z, PV.Y, literal.x, +; EG-NEXT: CNDE_INT T1.W, T1.Z, T4.W, 0.0, +; EG-NEXT: CNDE_INT * T2.W, PV.X, T2.Y, 0.0, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W, -; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y, -; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z, -; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z, -; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) +; EG-NEXT: CNDE_INT T2.X, T6.W, PS, PV.W, +; EG-NEXT: ASHR T2.Y, KC0[3].Z, literal.x, +; EG-NEXT: CNDE_INT T1.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T3.X, +; EG-NEXT: ASHR * T2.W, KC0[4].X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W, -; EG-NEXT: XOR_INT T1.Y, PV.W, PS, -; EG-NEXT: XOR_INT T0.Z, PV.Z, PS, -; EG-NEXT: OR_INT T0.W, PV.Y, literal.y, -; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X, -; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38) -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x, -; EG-NEXT: AND_INT T3.Y, PS, literal.y, -; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS, 
-; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W, -; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W, -; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44) -; EG-NEXT: SUB_INT T5.X, PV.W, PS, -; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y, -; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0, -; EG-NEXT: OR_INT T1.W, PV.X, literal.x, -; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) -; EG-NEXT: ADD_INT T4.X, T3.X, literal.x, -; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X, -; EG-NEXT: AND_INT T2.Z, PS, literal.z, -; EG-NEXT: NOT_INT T4.W, PS, -; EG-NEXT: LSHR * T5.W, PV.W, 1, -; EG-NEXT: -127(nan), 150(2.101948e-43) +; EG-NEXT: XOR_INT T3.X, PV.W, PS, +; EG-NEXT: XOR_INT T1.Y, PV.Z, PS, +; EG-NEXT: XOR_INT T0.Z, PV.X, PV.Y, +; EG-NEXT: BFE_UINT T1.W, KC0[3].W, literal.x, T0.W, +; EG-NEXT: XOR_INT * T3.W, T0.Y, PV.Y, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T2.X, PV.W, literal.x, +; EG-NEXT: SUB_INT T0.Y, PS, T2.Y, +; EG-NEXT: SUBB_UINT T1.Z, PV.Z, T2.Y, +; EG-NEXT: SUB_INT T3.W, PV.Y, T2.W, +; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W, +; EG-NEXT: -150(nan), 0(0.000000e+00) +; EG-NEXT: SUB_INT T4.X, PV.W, PS, +; EG-NEXT: SUB_INT T0.Y, PV.Y, PV.Z, +; EG-NEXT: AND_INT T1.Z, PV.X, literal.x, +; EG-NEXT: BFE_UINT T0.W, KC0[3].Y, literal.y, T0.W, +; EG-NEXT: OR_INT * T3.W, T1.X, literal.z, +; EG-NEXT: 31(4.344025e-44), 23(3.222986e-44) +; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X, +; EG-NEXT: ADD_INT T1.Y, PV.W, literal.x, +; EG-NEXT: AND_INT T2.Z, KC0[3].Y, literal.y, +; EG-NEXT: LSHL T4.W, PS, PV.Z, +; EG-NEXT: AND_INT * T5.W, T2.X, literal.z, +; EG-NEXT: -150(nan), 8388607(1.175494e-38) +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, +; EG-NEXT: NOT_INT T4.Y, T2.X, +; EG-NEXT: OR_INT T1.Z, PV.Z, literal.x, +; EG-NEXT: NOT_INT T6.W, PV.Y, +; EG-NEXT: SUB_INT * T7.W, literal.y, T0.W, +; EG-NEXT: 8388608(1.175494e-38), 150(2.101948e-43) +; EG-NEXT: ADD_INT T2.X, T0.W, 
literal.x, +; EG-NEXT: AND_INT T5.Y, T1.Y, literal.y, +; EG-NEXT: AND_INT * T2.Z, PS, literal.y, +; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: ALU clause starting at 107: +; EG-NEXT: AND_INT T0.W, T6.W, literal.x, +; EG-NEXT: LSHR * T6.W, T1.Z, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T4.Y, T1.W, PV.Z, -; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122 -; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x, +; EG-NEXT: LSHR T5.X, PS, PV.W, +; EG-NEXT: LSHR T6.Y, T1.Z, T2.Z, +; EG-NEXT: AND_INT T2.Z, T7.W, literal.x, +; EG-NEXT: LSHL T0.W, T1.Z, T5.Y, +; EG-NEXT: AND_INT * T6.W, T1.Y, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T6.X, T1.X, literal.x, -; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0, -; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 108: -; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y, -; EG-NEXT: SETGT_INT * T3.W, T4.X, literal.x, -; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z, -; EG-NEXT: AND_INT T2.Z, T6.X, literal.x, -; EG-NEXT: NOT_INT T1.W, T6.X, -; EG-NEXT: LSHR * T3.W, T0.W, 1, +; EG-NEXT: SUB_INT T6.X, literal.x, T1.W, +; EG-NEXT: CNDE_INT T1.Y, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0, +; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W, +; EG-NEXT: SETGT_INT * T6.W, T2.X, literal.y, +; EG-NEXT: 150(2.101948e-43), 23(3.222986e-44) +; EG-NEXT: CNDE_INT T5.X, PS, 0.0, PV.W, +; EG-NEXT: CNDE_INT T1.Y, PS, PV.Z, PV.Y, +; EG-NEXT: AND_INT T1.Z, PV.X, literal.x, +; EG-NEXT: AND_INT T0.W, T4.Y, literal.x, +; EG-NEXT: LSHR * T6.W, T3.W, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x, -; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y, -; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W, -; EG-NEXT: 
LSHL T0.W, T0.W, PV.Z, +; EG-NEXT: ADD_INT T4.Y, T1.W, literal.y, +; EG-NEXT: LSHR T2.Z, PS, PV.W, +; EG-NEXT: LSHR T0.W, T3.W, PV.Z, BS:VEC_120/SCL_212 ; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, ; EG-NEXT: 31(4.344025e-44), -127(nan) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W, -; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x, -; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X, -; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X, +; EG-NEXT: CNDE_INT T6.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T5.Y, T5.W, PV.Z, T4.W, +; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, T1.Y, PV.X, +; EG-NEXT: XOR_INT * T1.W, T5.X, PV.X, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: SUB_INT T3.X, PS, T7.X, -; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X, -; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y, -; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X, +; EG-NEXT: SUB_INT T5.X, PS, T7.X, +; EG-NEXT: SUBB_UINT T1.Y, PV.W, T7.X, +; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X, ; EG-NEXT: ASHR * T3.W, KC0[3].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T1.X, PV.W, PS, ; EG-NEXT: XOR_INT T5.Y, PV.Z, PS, -; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y, -; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122 -; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0, -; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X, +; EG-NEXT: SUB_INT T1.Z, PV.X, PV.Y, +; EG-NEXT: SETGT_INT T1.W, 0.0, T2.X, +; EG-NEXT: CNDE_INT * T6.W, T0.X, T0.Y, 0.0, +; EG-NEXT: SETGT_INT T2.X, 0.0, T3.Y, ; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0, -; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W, -; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W, -; EG-NEXT: SUB_INT T3.X, PV.W, PS, -; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y, -; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0, +; EG-NEXT: SUB_INT T0.Z, T0.Z, T2.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T4.W, PV.Y, T3.W, +; EG-NEXT: SUBB_UINT * 
T5.W, PV.X, T3.W, +; EG-NEXT: SUB_INT T5.X, PV.W, PS, +; EG-NEXT: SETGT_INT T0.Y, 0.0, T4.Y, +; EG-NEXT: CNDE_INT T6.Z, T0.X, PV.Z, 0.0, ; EG-NEXT: SUB_INT T0.W, T0.W, T7.X, BS:VEC_021/SCL_122 -; EG-NEXT: CNDE_INT * T4.W, PV.X, T2.X, 0.0, +; EG-NEXT: CNDE_INT * T4.W, PV.X, T4.X, 0.0, ; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0, ; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0, -; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y, -; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, +; EG-NEXT: SUB_INT T0.W, T3.X, T2.W, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0, +; EG-NEXT: CNDE_INT T4.Z, T2.X, PV.W, 0.0, ; EG-NEXT: SUB_INT * T0.W, T1.X, T3.W, BS:VEC_120/SCL_212 -; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0, +; EG-NEXT: CNDE_INT T4.X, T0.Y, PV.W, 0.0, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR * T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %conv = fptosi <4 x float> %x to <4 x i64> store <4 x i64> %conv, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -186,39 +186,41 @@ ; ; EG-LABEL: fp_to_uint_f32_to_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 40, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 42, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T0.W, KC0[2].Z, literal.x, PV.W, -; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.y, -; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) -; EG-NEXT: OR_INT T1.W, PS, literal.x, -; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) +; EG-NEXT: BFE_UINT * T0.W, KC0[2].Z, 
literal.x, PV.W, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T2.W, KC0[2].Z, literal.y, +; EG-NEXT: -150(nan), 8388607(1.175494e-38) +; EG-NEXT: OR_INT T0.Z, PS, literal.x, +; EG-NEXT: NOT_INT T2.W, PV.W, +; EG-NEXT: SUB_INT * T3.W, literal.y, T0.W, +; EG-NEXT: 8388608(1.175494e-38), 150(2.101948e-43) ; EG-NEXT: ADD_INT T0.X, T0.W, literal.x, -; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W, -; EG-NEXT: AND_INT T0.Z, PS, literal.z, -; EG-NEXT: NOT_INT T0.W, PS, -; EG-NEXT: LSHR * T3.W, PV.W, 1, -; EG-NEXT: -127(nan), 150(2.101948e-43) -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T1.Y, T1.W, PV.Z, -; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122 -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT T0.Y, T1.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T1.Z, PS, literal.y, +; EG-NEXT: AND_INT T0.W, PV.W, literal.y, +; EG-NEXT: LSHR * T2.W, PV.Z, 1, +; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: LSHR T1.X, PS, PV.W, +; EG-NEXT: LSHR T1.Y, T0.Z, PV.Z, +; EG-NEXT: AND_INT T1.Z, T3.W, literal.x, +; EG-NEXT: LSHL T0.W, T0.Z, PV.Y, +; EG-NEXT: AND_INT * T1.W, T1.W, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0, -; EG-NEXT: CNDE_INT T0.W, PV.Z, PV.X, PV.Y, +; EG-NEXT: CNDE_INT T0.Z, PV.Z, PV.Y, 0.0, +; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W, ; EG-NEXT: SETGT_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.Z, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T0.W, PS, PV.Y, PV.Z, +; EG-NEXT: CNDE_INT T1.Z, PS, 0.0, PV.W, +; EG-NEXT: CNDE_INT T0.W, PS, PV.Z, PV.Y, ; EG-NEXT: ASHR * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T0.W, PV.W, PS, @@ -288,7 +290,7 @@ ; ; EG-LABEL: 
fp_to_uint_v2f32_to_v2i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 80, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -298,72 +300,77 @@ ; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W, -; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z, -; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44) +; EG-NEXT: SUB_INT T2.W, literal.y, PV.W, +; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z, +; EG-NEXT: 8388607(1.175494e-38), 150(2.101948e-43) ; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: SUB_INT T0.X, literal.x, PV.W, -; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W, -; EG-NEXT: AND_INT T1.Z, PS, literal.y, -; EG-NEXT: OR_INT T3.W, PV.Z, literal.z, -; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w, -; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44) -; EG-NEXT: 8388608(1.175494e-38), 8388607(1.175494e-38) -; EG-NEXT: OR_INT T1.X, PS, literal.x, -; EG-NEXT: LSHL T1.Y, PV.W, PV.Z, +; EG-NEXT: AND_INT T0.X, KC0[3].X, literal.x, +; EG-NEXT: AND_INT T0.Y, PS, literal.y, +; EG-NEXT: AND_INT T1.Z, PV.W, literal.y, +; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.z, T0.W, +; EG-NEXT: OR_INT * T4.W, PV.Z, literal.w, +; EG-NEXT: 8388607(1.175494e-38), 31(4.344025e-44) +; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38) +; EG-NEXT: ADD_INT T1.X, PV.W, literal.x, +; EG-NEXT: LSHR T1.Y, PS, PV.Z, ; EG-NEXT: AND_INT T0.Z, T2.W, literal.y, -; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.W, PV.Y, -; EG-NEXT: AND_INT * T5.W, PV.Y, literal.y, -; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0, -; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x, -; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X, -; EG-NEXT: AND_INT * T5.W, T0.X, literal.y, +; EG-NEXT: LSHL T2.W, PS, PV.Y, +; EG-NEXT: AND_INT * T5.W, 
T3.W, literal.y, ; EG-NEXT: -150(nan), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0, -; EG-NEXT: NOT_INT T2.Y, T2.W, -; EG-NEXT: AND_INT T2.Z, PV.Z, literal.x, -; EG-NEXT: NOT_INT T2.W, PV.Z, -; EG-NEXT: LSHR * T4.W, T1.X, 1, +; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T0.Z, PV.Z, PV.Y, 0.0, +; EG-NEXT: AND_INT T6.W, PV.X, literal.x, +; EG-NEXT: OR_INT * T7.W, T0.X, literal.y, +; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38) +; EG-NEXT: NOT_INT T0.X, T1.X, +; EG-NEXT: SUB_INT T1.Y, literal.x, T0.W, +; EG-NEXT: NOT_INT T1.Z, T3.W, +; EG-NEXT: LSHL T3.W, PS, PV.W, +; EG-NEXT: AND_INT * T6.W, T1.X, literal.y, +; EG-NEXT: 150(2.101948e-43), 32(4.484155e-44) +; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, +; EG-NEXT: AND_INT T2.Y, PV.Z, literal.x, +; EG-NEXT: AND_INT T1.Z, PV.Y, literal.x, +; EG-NEXT: AND_INT T8.W, PV.X, literal.x, +; EG-NEXT: LSHR * T9.W, T7.W, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T3.X, T3.W, 1, +; EG-NEXT: LSHR T0.X, T4.W, 1, ; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W, -; EG-NEXT: LSHL T0.W, T1.X, PV.Z, -; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y, +; EG-NEXT: LSHR T2.Z, PS, PV.W, +; EG-NEXT: LSHR T0.W, T7.W, PV.Z, BS:VEC_201 +; EG-NEXT: AND_INT * T4.W, T1.Y, literal.y, ; EG-NEXT: -127(nan), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W, +; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Y, T6.W, PV.Z, T3.W, ; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y, -; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y, +; EG-NEXT: ADD_INT T0.W, T1.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR * T1.W, PV.X, T2.Y, ; EG-NEXT: 23(3.222986e-44), -127(nan) -; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y, -; EG-NEXT: SETGT_INT T1.Y, PS, literal.x, -; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y, -; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X, +; EG-NEXT: 
CNDE_INT T0.X, T5.W, PS, T2.W, +; EG-NEXT: SETGT_INT T2.Y, PV.W, literal.x, +; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X, ; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y, ; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44) -; EG-NEXT: XOR_INT T0.X, PV.W, PS, -; EG-NEXT: XOR_INT T2.Y, PV.Z, PS, -; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X, -; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y, +; EG-NEXT: XOR_INT T1.X, PV.W, PS, +; EG-NEXT: XOR_INT T1.Y, PV.Z, PS, +; EG-NEXT: CNDE_INT T1.Z, PV.Y, 0.0, PV.X, +; EG-NEXT: CNDE_INT T1.W, PV.Y, T0.Z, T0.Y, ; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T0.Y, PV.W, PS, ; EG-NEXT: XOR_INT T0.Z, PV.Z, PS, -; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W, +; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W, ; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W, ; EG-NEXT: SUB_INT T1.Y, PV.W, PS, ; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y, -; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W, +; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W, ; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W, ; EG-NEXT: SUB_INT T0.Z, PV.W, PS, -; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W, +; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W, ; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0, ; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0, -; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W, +; EG-NEXT: SUB_INT * T2.W, T1.X, T2.W, ; EG-NEXT: CNDE_INT T1.Z, T1.Z, PV.W, 0.0, ; EG-NEXT: SUB_INT * T2.W, T0.Y, T3.W, ; EG-NEXT: CNDE_INT T1.X, T0.W, PV.W, 0.0, @@ -449,170 +456,181 @@ ; ; EG-LABEL: fp_to_uint_v4f32_to_v4i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1 +; EG-NEXT: ALU 100, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 66, @107, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause 
starting at 6: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W, -; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y, -; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) -; EG-NEXT: OR_INT T0.Z, PS, literal.x, -; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W, -; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z, -; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44) -; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x, -; EG-NEXT: AND_INT T1.Z, PS, literal.y, -; EG-NEXT: NOT_INT T4.W, PS, -; EG-NEXT: LSHR * T5.W, PV.Z, 1, -; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: BFE_UINT * T1.W, KC0[3].Z, literal.x, PV.W, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T2.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T3.W, KC0[3].Z, literal.y, +; EG-NEXT: -150(nan), 8388607(1.175494e-38) +; EG-NEXT: OR_INT T3.W, PS, literal.x, +; EG-NEXT: NOT_INT * T4.W, PV.W, +; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, +; EG-NEXT: AND_INT T4.W, PS, literal.x, +; EG-NEXT: LSHR * T5.W, PV.W, 1, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.X, T1.W, literal.x, -; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W, -; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201 -; EG-NEXT: LSHL T3.W, T0.Z, PV.Z, -; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W, +; EG-NEXT: LSHR T0.Y, PS, PV.W, +; EG-NEXT: AND_INT T1.Z, T2.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: BFE_UINT * T2.W, KC0[4].X, literal.z, T0.W, ; EG-NEXT: -127(nan), 32(4.484155e-44) -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.X, PS, literal.x, -; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS, -; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y, -; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W, -; EG-NEXT: SETGT_INT * T4.W, PV.X, literal.z, -; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT 
T2.X, PS, 0.0, PV.W, -; EG-NEXT: OR_INT T1.Y, PV.Z, literal.x, +; EG-NEXT: LSHL * T4.W, T3.W, T0.Z, +; EG-NEXT: AND_INT T1.Y, KC0[4].X, literal.x, ; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y, -; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0, -; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) -; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS, -; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x, -; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x, -; EG-NEXT: NOT_INT T1.W, PV.Z, -; EG-NEXT: LSHR * T3.W, PV.Y, 1, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z, -; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y, -; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y, -; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W, -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.X, T0.Z, literal.x, -; EG-NEXT: AND_INT T4.Y, PS, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y, -; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y, +; EG-NEXT: CNDE_INT T5.W, T1.Z, T0.Y, PV.W, +; EG-NEXT: SETGT_INT * T6.W, T0.X, literal.z, +; EG-NEXT: 8388607(1.175494e-38), -150(nan) +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x, +; EG-NEXT: CNDE_INT T0.Y, PS, 0.0, PV.W, +; EG-NEXT: AND_INT T2.Z, PV.Z, literal.y, +; EG-NEXT: OR_INT T5.W, PV.Y, literal.z, +; EG-NEXT: SUB_INT * T1.W, literal.w, T1.W, +; EG-NEXT: 8388607(1.175494e-38), 31(4.344025e-44) +; EG-NEXT: 8388608(1.175494e-38), 150(2.101948e-43) +; EG-NEXT: NOT_INT T2.X, T0.Z, +; EG-NEXT: SUB_INT T1.Y, literal.x, T2.W, +; EG-NEXT: AND_INT T3.Z, PS, literal.y, +; EG-NEXT: LSHL T7.W, PV.W, PV.Z, +; EG-NEXT: AND_INT * T8.W, T0.Z, literal.z, +; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: SUB_INT T2.X, PV.W, PS, -; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0, -; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0, -; EG-NEXT: CNDE_INT T1.W, PV.X, 
T3.X, T3.Y, BS:VEC_021/SCL_122 -; EG-NEXT: SETGT_INT * T2.W, T0.Y, literal.x, +; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0, +; EG-NEXT: LSHR T2.Y, T3.W, PV.Z, +; EG-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; EG-NEXT: AND_INT T3.W, PV.X, literal.x, +; EG-NEXT: LSHR * T9.W, T5.W, 1, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.X, T1.W, literal.x, +; EG-NEXT: ADD_INT T3.Y, T2.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T2.Z, PS, PV.W, +; EG-NEXT: LSHR T1.W, T5.W, PV.Z, BS:VEC_201 +; EG-NEXT: AND_INT * T2.W, T1.Y, literal.x, +; EG-NEXT: 32(4.484155e-44), -127(nan) +; EG-NEXT: CNDE_INT T4.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Y, T8.W, PV.Z, T7.W, +; EG-NEXT: SETGT_INT T0.Z, PV.Y, literal.x, +; EG-NEXT: CNDE_INT T1.W, T1.Z, T4.W, 0.0, +; EG-NEXT: CNDE_INT * T2.W, PV.X, T2.Y, 0.0, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W, -; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y, -; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z, -; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z, -; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) +; EG-NEXT: CNDE_INT T2.X, T6.W, PS, PV.W, +; EG-NEXT: ASHR T2.Y, KC0[3].Z, literal.x, +; EG-NEXT: CNDE_INT T1.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T3.X, +; EG-NEXT: ASHR * T2.W, KC0[4].X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W, -; EG-NEXT: XOR_INT T1.Y, PV.W, PS, -; EG-NEXT: XOR_INT T0.Z, PV.Z, PS, -; EG-NEXT: OR_INT T0.W, PV.Y, literal.y, -; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X, -; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38) -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x, -; EG-NEXT: AND_INT T3.Y, PS, literal.y, -; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS, -; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W, -; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W, -; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44) -; EG-NEXT: 
SUB_INT T5.X, PV.W, PS, -; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y, -; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0, -; EG-NEXT: OR_INT T1.W, PV.X, literal.x, -; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) -; EG-NEXT: ADD_INT T4.X, T3.X, literal.x, -; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X, -; EG-NEXT: AND_INT T2.Z, PS, literal.z, -; EG-NEXT: NOT_INT T4.W, PS, -; EG-NEXT: LSHR * T5.W, PV.W, 1, -; EG-NEXT: -127(nan), 150(2.101948e-43) +; EG-NEXT: XOR_INT T3.X, PV.W, PS, +; EG-NEXT: XOR_INT T1.Y, PV.Z, PS, +; EG-NEXT: XOR_INT T0.Z, PV.X, PV.Y, +; EG-NEXT: BFE_UINT T1.W, KC0[3].W, literal.x, T0.W, +; EG-NEXT: XOR_INT * T3.W, T0.Y, PV.Y, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T2.X, PV.W, literal.x, +; EG-NEXT: SUB_INT T0.Y, PS, T2.Y, +; EG-NEXT: SUBB_UINT T1.Z, PV.Z, T2.Y, +; EG-NEXT: SUB_INT T3.W, PV.Y, T2.W, +; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W, +; EG-NEXT: -150(nan), 0(0.000000e+00) +; EG-NEXT: SUB_INT T4.X, PV.W, PS, +; EG-NEXT: SUB_INT T0.Y, PV.Y, PV.Z, +; EG-NEXT: AND_INT T1.Z, PV.X, literal.x, +; EG-NEXT: BFE_UINT T0.W, KC0[3].Y, literal.y, T0.W, +; EG-NEXT: OR_INT * T3.W, T1.X, literal.z, +; EG-NEXT: 31(4.344025e-44), 23(3.222986e-44) +; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X, +; EG-NEXT: ADD_INT T1.Y, PV.W, literal.x, +; EG-NEXT: AND_INT T2.Z, KC0[3].Y, literal.y, +; EG-NEXT: LSHL T4.W, PS, PV.Z, +; EG-NEXT: AND_INT * T5.W, T2.X, literal.z, +; EG-NEXT: -150(nan), 8388607(1.175494e-38) +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, +; EG-NEXT: NOT_INT T4.Y, T2.X, +; EG-NEXT: OR_INT T1.Z, PV.Z, literal.x, +; EG-NEXT: NOT_INT T6.W, PV.Y, +; EG-NEXT: SUB_INT * T7.W, literal.y, T0.W, +; EG-NEXT: 8388608(1.175494e-38), 150(2.101948e-43) +; EG-NEXT: ADD_INT T2.X, T0.W, literal.x, +; EG-NEXT: AND_INT T5.Y, T1.Y, literal.y, +; EG-NEXT: AND_INT * T2.Z, PS, literal.y, +; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: 
ALU clause starting at 107: +; EG-NEXT: AND_INT T0.W, T6.W, literal.x, +; EG-NEXT: LSHR * T6.W, T1.Z, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T4.Y, T1.W, PV.Z, -; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122 -; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x, +; EG-NEXT: LSHR T5.X, PS, PV.W, +; EG-NEXT: LSHR T6.Y, T1.Z, T2.Z, +; EG-NEXT: AND_INT T2.Z, T7.W, literal.x, +; EG-NEXT: LSHL T0.W, T1.Z, T5.Y, +; EG-NEXT: AND_INT * T6.W, T1.Y, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T6.X, T1.X, literal.x, -; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0, -; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 108: -; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y, -; EG-NEXT: SETGT_INT * T3.W, T4.X, literal.x, -; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z, -; EG-NEXT: AND_INT T2.Z, T6.X, literal.x, -; EG-NEXT: NOT_INT T1.W, T6.X, -; EG-NEXT: LSHR * T3.W, T0.W, 1, +; EG-NEXT: SUB_INT T6.X, literal.x, T1.W, +; EG-NEXT: CNDE_INT T1.Y, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0, +; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W, +; EG-NEXT: SETGT_INT * T6.W, T2.X, literal.y, +; EG-NEXT: 150(2.101948e-43), 23(3.222986e-44) +; EG-NEXT: CNDE_INT T5.X, PS, 0.0, PV.W, +; EG-NEXT: CNDE_INT T1.Y, PS, PV.Z, PV.Y, +; EG-NEXT: AND_INT T1.Z, PV.X, literal.x, +; EG-NEXT: AND_INT T0.W, T4.Y, literal.x, +; EG-NEXT: LSHR * T6.W, T3.W, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x, -; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y, -; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W, -; EG-NEXT: LSHL T0.W, T0.W, PV.Z, +; EG-NEXT: ADD_INT T4.Y, T1.W, literal.y, +; EG-NEXT: LSHR T2.Z, PS, PV.W, +; EG-NEXT: LSHR T0.W, T3.W, PV.Z, BS:VEC_120/SCL_212 
; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, ; EG-NEXT: 31(4.344025e-44), -127(nan) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W, -; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x, -; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X, -; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X, +; EG-NEXT: CNDE_INT T6.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T5.Y, T5.W, PV.Z, T4.W, +; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, T1.Y, PV.X, +; EG-NEXT: XOR_INT * T1.W, T5.X, PV.X, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: SUB_INT T3.X, PS, T7.X, -; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X, -; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y, -; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X, +; EG-NEXT: SUB_INT T5.X, PS, T7.X, +; EG-NEXT: SUBB_UINT T1.Y, PV.W, T7.X, +; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X, ; EG-NEXT: ASHR * T3.W, KC0[3].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T1.X, PV.W, PS, ; EG-NEXT: XOR_INT T5.Y, PV.Z, PS, -; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y, -; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122 -; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0, -; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X, +; EG-NEXT: SUB_INT T1.Z, PV.X, PV.Y, +; EG-NEXT: SETGT_INT T1.W, 0.0, T2.X, +; EG-NEXT: CNDE_INT * T6.W, T0.X, T0.Y, 0.0, +; EG-NEXT: SETGT_INT T2.X, 0.0, T3.Y, ; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0, -; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W, -; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W, -; EG-NEXT: SUB_INT T3.X, PV.W, PS, -; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y, -; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0, +; EG-NEXT: SUB_INT T0.Z, T0.Z, T2.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T4.W, PV.Y, T3.W, +; EG-NEXT: SUBB_UINT * T5.W, PV.X, T3.W, +; EG-NEXT: SUB_INT T5.X, PV.W, PS, +; EG-NEXT: SETGT_INT T0.Y, 0.0, T4.Y, +; EG-NEXT: CNDE_INT T6.Z, T0.X, PV.Z, 0.0, ; EG-NEXT: SUB_INT 
T0.W, T0.W, T7.X, BS:VEC_021/SCL_122 -; EG-NEXT: CNDE_INT * T4.W, PV.X, T2.X, 0.0, +; EG-NEXT: CNDE_INT * T4.W, PV.X, T4.X, 0.0, ; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0, ; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0, -; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y, -; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, +; EG-NEXT: SUB_INT T0.W, T3.X, T2.W, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0, +; EG-NEXT: CNDE_INT T4.Z, T2.X, PV.W, 0.0, ; EG-NEXT: SUB_INT * T0.W, T1.X, T3.W, BS:VEC_120/SCL_212 -; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0, +; EG-NEXT: CNDE_INT T4.X, T0.Y, PV.W, 0.0, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR * T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %conv = fptoui <4 x float> %x to <4 x i64> store <4 x i64> %conv, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -18,14 +18,17 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_lshr_b32 s4, s7, 1 +; SI-NEXT: s_lshl_b32 s5, s6, 31 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_not_b32 s5, s8 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; SI-NEXT: s_lshr_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s6, -2 +; SI-NEXT: s_lshl_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -34,14 +37,17 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: s_lshr_b32 s1, s6, 1 -; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: s_lshr_b32 s1, s7, 1 +; VI-NEXT: s_lshl_b32 s2, s6, 31 +; VI-NEXT: s_not_b32 s3, s0 +; VI-NEXT: s_and_b32 s6, s6, -2 +; VI-NEXT: s_or_b32 s1, s2, s1 +; VI-NEXT: s_lshr_b32 s1, s1, s3 +; VI-NEXT: s_lshl_b32 s0, s6, s0 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -51,26 +57,37 @@ ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_not_b32 s1, s2 -; GFX9-NEXT: s_lshr_b32 s0, s6, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: s_lshr_b32 s0, s7, 1 +; GFX9-NEXT: s_lshl_b32 s1, s6, 31 +; GFX9-NEXT: s_not_b32 s3, s2 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s1, s6, -2 +; GFX9-NEXT: s_lshr_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR T0.Z, KC0[2].Z, 1, -; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1, +; R600-NEXT: LSHL T0.Z, KC0[2].Z, literal.x, +; R600-NEXT: LSHR T0.W, KC0[2].W, 1, ; R600-NEXT: NOT_INT * T1.W, KC0[3].X, -; R600-NEXT: BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: AND_INT 
T0.Y, KC0[2].Z, literal.x, +; R600-NEXT: AND_INT T1.Z, KC0[3].X, literal.y, +; R600-NEXT: AND_INT T1.W, PS, literal.y, +; R600-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; R600-NEXT: -2(nan), 31(4.344025e-44) +; R600-NEXT: LSHR T0.W, PS, PV.W, +; R600-NEXT: LSHL * T1.W, PV.Y, PV.Z, +; R600-NEXT: OR_INT T0.X, PS, PV.W, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; @@ -79,13 +96,18 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, 1 -; GFX10-NEXT: s_lshr_b32 s0, s6, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_lshr_b32 s0, s7, 1 +; GFX10-NEXT: s_lshl_b32 s1, s6, 31 +; GFX10-NEXT: s_not_b32 s3, s2 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_and_b32 s1, s6, -2 +; GFX10-NEXT: s_lshr_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: @@ -93,14 +115,18 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s6, s7, 1 -; GFX11-NEXT: s_lshr_b32 s1, s6, 1 -; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s1, v0, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_lshr_b32 s1, s7, 1 +; GFX11-NEXT: s_lshl_b32 s2, s6, 31 +; GFX11-NEXT: s_not_b32 s3, s0 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s6, -2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 
s0, s2, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -116,10 +142,12 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 +; SI-NEXT: s_lshr_b32 s0, s3, 25 +; SI-NEXT: s_lshl_b32 s1, s2, 7 +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -127,10 +155,12 @@ ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 +; VI-NEXT: s_lshr_b32 s3, s3, 25 +; VI-NEXT: s_lshl_b32 s2, s2, 7 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -139,38 +169,48 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 +; GFX9-NEXT: s_lshr_b32 s3, s3, 25 +; GFX9-NEXT: s_lshl_b32 s2, s2, 7 +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32_imm: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; 
R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: LSHL T0.W, KC0[2].Z, literal.x, +; R600-NEXT: LSHR * T1.W, KC0[2].W, literal.y, +; R600-NEXT: 7(9.809089e-45), 25(3.503246e-44) +; R600-NEXT: OR_INT T0.X, PV.W, PS, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x, -; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00) ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX10-NEXT: s_lshr_b32 s3, s3, 25 +; GFX10-NEXT: s_lshl_b32 s2, s2, 7 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX11-NEXT: s_lshr_b32 s3, s3, 25 +; GFX11-NEXT: s_lshl_b32 s2, s2, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -189,18 +229,24 @@ ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; SI-NEXT: s_not_b32 s1, s1 -; SI-NEXT: s_lshr_b32 s2, s5, 1 +; SI-NEXT: s_lshr_b32 s2, s7, 1 +; SI-NEXT: s_lshl_b32 s3, s5, 31 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_not_b32 s3, s1 +; SI-NEXT: s_lshr_b32 s2, s2, s3 +; SI-NEXT: s_and_b32 s3, s5, -2 +; SI-NEXT: s_lshl_b32 s1, s3, s1 +; SI-NEXT: s_or_b32 s1, s1, s2 +; SI-NEXT: 
s_lshr_b32 s2, s6, 1 +; SI-NEXT: s_lshl_b32 s3, s4, 31 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_not_b32 s3, s0 +; SI-NEXT: s_lshr_b32 s2, s2, s3 +; SI-NEXT: s_and_b32 s3, s4, -2 +; SI-NEXT: s_lshl_b32 s0, s3, s0 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_not_b32 s0, s0 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_lshr_b32 s1, s4, 1 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -210,19 +256,25 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_not_b32 s3, s3 -; VI-NEXT: s_lshr_b32 s7, s5, 1 -; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s3, s4, 1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v0, s3, v0, v2 +; VI-NEXT: s_lshl_b32 s8, s5, 31 +; VI-NEXT: s_and_b32 s5, s5, -2 +; VI-NEXT: s_lshr_b32 s7, s7, 1 +; VI-NEXT: s_not_b32 s9, s3 +; VI-NEXT: s_lshl_b32 s3, s5, s3 +; VI-NEXT: s_lshr_b32 s5, s6, 1 +; VI-NEXT: s_lshl_b32 s6, s4, 31 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_not_b32 s6, s2 +; VI-NEXT: s_and_b32 s4, s4, -2 +; VI-NEXT: s_lshr_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s5, s5, s6 +; VI-NEXT: s_lshl_b32 s2, s4, s2 +; VI-NEXT: s_or_b32 s3, s3, s7 +; VI-NEXT: s_or_b32 s2, s2, s5 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -234,36 +286,58 @@ ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 
0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: s_lshr_b32 s0, s7, 1 +; GFX9-NEXT: s_lshl_b32 s1, s5, 31 +; GFX9-NEXT: s_or_b32 s0, s1, s0 ; GFX9-NEXT: s_not_b32 s1, s9 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s1, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s5, -2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s9 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_lshr_b32 s1, s6, 1 +; GFX9-NEXT: s_lshl_b32 s5, s4, 31 +; GFX9-NEXT: s_or_b32 s1, s5, s1 +; GFX9-NEXT: s_not_b32 s5, s8 +; GFX9-NEXT: s_and_b32 s4, s4, -2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b32 s4, s4, s8 +; GFX9-NEXT: s_or_b32 s1, s4, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR T0.Z, KC0[3].X, 1, -; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1, -; R600-NEXT: NOT_INT * T1.W, KC0[4].X, -; R600-NEXT: BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W, -; R600-NEXT: LSHR T0.Z, KC0[2].W, 1, -; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1, -; R600-NEXT: NOT_INT * T1.W, KC0[3].W, -; R600-NEXT: BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W, +; R600-NEXT: AND_INT T0.W, KC0[2].W, literal.x, +; R600-NEXT: AND_INT * T1.W, KC0[4].X, literal.y, +; R600-NEXT: -2(nan), 31(4.344025e-44) +; R600-NEXT: 
AND_INT T0.Y, KC0[3].X, literal.x, +; R600-NEXT: LSHL T0.Z, KC0[3].X, literal.y, +; R600-NEXT: LSHR * T2.W, KC0[3].Z, 1, +; R600-NEXT: -2(nan), 31(4.344025e-44) +; R600-NEXT: NOT_INT * T3.W, KC0[4].X, +; R600-NEXT: AND_INT T0.X, PV.W, literal.x, +; R600-NEXT: OR_INT T1.Y, T0.Z, T2.W, +; R600-NEXT: LSHL T0.Z, KC0[2].W, literal.x, +; R600-NEXT: LSHR * T2.W, KC0[3].Y, 1, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: NOT_INT * T3.W, KC0[3].W, +; R600-NEXT: AND_INT T1.X, KC0[3].W, literal.x, +; R600-NEXT: AND_INT T2.Y, PV.W, literal.x, +; R600-NEXT: OR_INT T0.Z, T0.Z, T2.W, BS:VEC_021/SCL_122 +; R600-NEXT: LSHR T2.W, T1.Y, T0.X, +; R600-NEXT: LSHL * T1.W, T0.Y, T1.W, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: OR_INT T0.Y, PS, PV.W, +; R600-NEXT: LSHR T1.W, PV.Z, PV.Y, +; R600-NEXT: LSHL * T0.W, T0.W, PV.X, +; R600-NEXT: OR_INT T0.X, PS, PV.W, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; @@ -275,14 +349,24 @@ ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s5, s7, 1 -; GFX10-NEXT: v_alignbit_b32 v3, s4, s6, 1 -; GFX10-NEXT: s_lshr_b32 s0, s5, 1 -; GFX10-NEXT: s_not_b32 s1, s3 -; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s2 +; GFX10-NEXT: s_lshr_b32 s0, s7, 1 +; GFX10-NEXT: s_lshl_b32 s1, s5, 31 +; GFX10-NEXT: s_and_b32 s5, s5, -2 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_lshl_b32 s10, s4, 31 +; GFX10-NEXT: s_not_b32 s7, s3 +; GFX10-NEXT: s_not_b32 s11, s2 +; GFX10-NEXT: s_and_b32 s4, s4, -2 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_lshl_b32 s1, s5, s3 +; GFX10-NEXT: s_or_b32 s3, s10, s6 +; GFX10-NEXT: s_lshl_b32 s2, s4, s2 +; GFX10-NEXT: s_lshr_b32 s3, s3, s11 +; GFX10-NEXT: s_lshr_b32 s0, s0, s7 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_or_b32 
s0, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; @@ -292,16 +376,26 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1 -; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1 -; GFX11-NEXT: s_lshr_b32 s5, s5, 1 -; GFX11-NEXT: s_not_b32 s3, s3 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_not_b32 s2, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2 +; GFX11-NEXT: s_lshr_b32 s7, s7, 1 +; GFX11-NEXT: s_lshl_b32 s8, s5, 31 +; GFX11-NEXT: s_and_b32 s5, s5, -2 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_lshl_b32 s10, s4, 31 +; GFX11-NEXT: s_not_b32 s9, s3 +; GFX11-NEXT: s_not_b32 s11, s2 +; GFX11-NEXT: s_and_b32 s4, s4, -2 +; GFX11-NEXT: s_or_b32 s7, s8, s7 +; GFX11-NEXT: s_lshl_b32 s3, s5, s3 +; GFX11-NEXT: s_or_b32 s5, s10, s6 +; GFX11-NEXT: s_lshl_b32 s2, s4, s2 +; GFX11-NEXT: s_lshr_b32 s4, s5, s11 +; GFX11-NEXT: s_lshr_b32 s5, s7, s9 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_or_b32 s3, s3, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -319,10 +413,14 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, 23 -; SI-NEXT: v_alignbit_b32 v0, s4, v2, 25 +; SI-NEXT: s_lshr_b32 s7, s7, 23 +; SI-NEXT: s_lshl_b32 s5, s5, 9 +; SI-NEXT: s_lshr_b32 s6, s6, 25 +; SI-NEXT: s_lshl_b32 s4, s4, 7 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: 
s_or_b32 s4, s4, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -331,11 +429,15 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23 -; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25 +; VI-NEXT: s_lshr_b32 s2, s7, 23 +; VI-NEXT: s_lshl_b32 s3, s5, 9 +; VI-NEXT: s_lshr_b32 s5, s6, 25 +; VI-NEXT: s_lshl_b32 s4, s4, 7 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s3, s4, s5 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -346,24 +448,32 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 +; GFX9-NEXT: s_lshr_b32 s0, s7, 23 +; GFX9-NEXT: s_lshl_b32 s1, s5, 9 +; GFX9-NEXT: s_lshr_b32 s5, s6, 25 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_lshl_b32 s1, s4, 7 +; GFX9-NEXT: s_or_b32 s1, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x, -; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x, -; R600-NEXT: 
25(3.503246e-44), 0(0.000000e+00) +; R600-NEXT: LSHL T0.W, KC0[3].X, literal.x, +; R600-NEXT: LSHR * T1.W, KC0[3].Z, literal.y, +; R600-NEXT: 9(1.261169e-44), 23(3.222986e-44) +; R600-NEXT: OR_INT T0.Y, PV.W, PS, +; R600-NEXT: LSHL T0.W, KC0[2].W, literal.x, +; R600-NEXT: LSHR * T1.W, KC0[3].Y, literal.y, +; R600-NEXT: 7(9.809089e-45), 25(3.503246e-44) +; R600-NEXT: OR_INT T0.X, PV.W, PS, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; @@ -374,8 +484,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 23 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 25 +; GFX10-NEXT: s_lshr_b32 s0, s7, 23 +; GFX10-NEXT: s_lshr_b32 s1, s6, 25 +; GFX10-NEXT: s_lshl_b32 s4, s4, 7 +; GFX10-NEXT: s_lshl_b32 s5, s5, 9 +; GFX10-NEXT: s_or_b32 s1, s4, s1 +; GFX10-NEXT: s_or_b32 s0, s5, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; @@ -384,10 +500,16 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 25 +; GFX11-NEXT: s_lshr_b32 s2, s7, 23 +; GFX11-NEXT: s_lshr_b32 s3, s6, 25 +; GFX11-NEXT: s_lshl_b32 s4, s4, 7 +; GFX11-NEXT: s_lshl_b32 s5, s5, 9 +; GFX11-NEXT: s_or_b32 s3, s4, s3 +; GFX11-NEXT: s_or_b32 s2, s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -406,30 +528,42 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v0, s11 -; SI-NEXT: s_not_b32 s11, s15 -; SI-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; SI-NEXT: s_lshr_b32 s7, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_not_b32 s7, s14 -; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s6, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_not_b32 s6, s13 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; SI-NEXT: s_lshr_b32 s5, s5, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_not_b32 s5, s12 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s4, 1 -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: s_lshr_b32 s11, s11, 1 +; SI-NEXT: s_lshl_b32 s16, s7, 31 +; SI-NEXT: s_or_b32 s11, s16, s11 +; SI-NEXT: s_not_b32 s16, s15 +; SI-NEXT: s_and_b32 s7, s7, -2 +; SI-NEXT: s_lshr_b32 s11, s11, s16 +; SI-NEXT: s_lshl_b32 s7, s7, s15 +; SI-NEXT: s_or_b32 s7, s7, s11 +; SI-NEXT: s_lshr_b32 s10, s10, 1 +; SI-NEXT: s_lshl_b32 s11, s6, 31 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_not_b32 s11, s14 +; SI-NEXT: s_and_b32 s6, s6, -2 +; SI-NEXT: s_lshr_b32 s10, s10, s11 +; SI-NEXT: s_lshl_b32 s6, s6, s14 +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_lshr_b32 s9, s9, 1 +; SI-NEXT: s_lshl_b32 s10, s5, 31 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_not_b32 s10, s13 +; SI-NEXT: s_and_b32 s5, s5, -2 +; SI-NEXT: s_lshr_b32 s9, s9, s10 +; SI-NEXT: s_lshl_b32 s5, s5, s13 +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_lshr_b32 s8, s8, 1 +; SI-NEXT: s_lshl_b32 s9, s4, 31 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_not_b32 s9, s12 +; SI-NEXT: s_and_b32 s4, s4, -2 +; SI-NEXT: s_lshr_b32 s8, s8, s9 +; SI-NEXT: s_lshl_b32 s4, s4, s12 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: 
v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -439,31 +573,43 @@ ; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: s_not_b32 s3, s15 -; VI-NEXT: s_lshr_b32 s2, s7, 1 -; VI-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v3, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: s_not_b32 s3, s14 -; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s6, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: s_not_b32 s3, s13 -; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s5, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_not_b32 s3, s12 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s4, 1 -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_alignbit_b32 v0, s2, v0, v4 +; VI-NEXT: s_lshr_b32 s2, s11, 1 +; VI-NEXT: s_lshl_b32 s3, s7, 31 +; VI-NEXT: s_not_b32 s11, s15 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_and_b32 s3, s7, -2 +; VI-NEXT: s_lshr_b32 s2, s2, s11 +; VI-NEXT: s_lshl_b32 s3, s3, s15 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_lshr_b32 s3, s10, 1 +; VI-NEXT: s_lshl_b32 s7, s6, 31 +; VI-NEXT: s_or_b32 s3, s7, s3 +; VI-NEXT: s_not_b32 s7, s14 +; VI-NEXT: s_and_b32 s6, s6, -2 +; VI-NEXT: s_lshr_b32 s3, s3, s7 +; VI-NEXT: s_lshl_b32 s6, s6, s14 +; VI-NEXT: s_or_b32 s3, s6, s3 +; VI-NEXT: s_lshr_b32 s6, s9, 1 +; VI-NEXT: s_lshl_b32 s7, s5, 31 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_not_b32 s7, s13 +; VI-NEXT: s_and_b32 s5, s5, -2 +; VI-NEXT: s_lshr_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s5, s5, s13 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_lshr_b32 s6, s8, 1 +; VI-NEXT: s_lshl_b32 s7, s4, 31 +; 
VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_not_b32 s7, s12 +; VI-NEXT: s_and_b32 s4, s4, -2 +; VI-NEXT: s_lshr_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s4, s4, s12 +; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -475,57 +621,101 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s11, 1 +; GFX9-NEXT: s_lshl_b32 s1, s7, 31 +; GFX9-NEXT: s_or_b32 s0, s1, s0 ; GFX9-NEXT: s_not_b32 s1, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_lshr_b32 s0, s7, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s1, s14 -; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s1, s13 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s5, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s1, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s7, -2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s15 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_lshr_b32 s1, s10, 1 +; GFX9-NEXT: s_lshl_b32 s7, s6, 31 +; GFX9-NEXT: s_or_b32 s1, s7, s1 +; GFX9-NEXT: s_not_b32 s7, s14 +; GFX9-NEXT: s_and_b32 s6, s6, -2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s7 +; GFX9-NEXT: s_lshl_b32 s6, s6, s14 +; GFX9-NEXT: 
s_or_b32 s1, s6, s1 +; GFX9-NEXT: s_lshr_b32 s6, s9, 1 +; GFX9-NEXT: s_lshl_b32 s7, s5, 31 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_not_b32 s7, s13 +; GFX9-NEXT: s_and_b32 s5, s5, -2 +; GFX9-NEXT: s_lshr_b32 s6, s6, s7 +; GFX9-NEXT: s_lshl_b32 s5, s5, s13 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_lshr_b32 s6, s8, 1 +; GFX9-NEXT: s_lshl_b32 s7, s4, 31 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_not_b32 s7, s12 +; GFX9-NEXT: s_and_b32 s4, s4, -2 +; GFX9-NEXT: s_lshr_b32 s6, s6, s7 +; GFX9-NEXT: s_lshl_b32 s4, s4, s12 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: ALU 49, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR T0.Z, KC0[4].X, 1, -; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1, -; R600-NEXT: NOT_INT * T1.W, KC0[6].X, -; R600-NEXT: LSHR T0.Y, KC0[3].W, 1, -; R600-NEXT: BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1, -; R600-NEXT: BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W, -; R600-NEXT: NOT_INT * T1.W, KC0[5].W, -; R600-NEXT: LSHR T1.Y, KC0[3].Z, 1, -; R600-NEXT: BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W, -; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1, +; R600-NEXT: AND_INT * T0.W, KC0[5].Z, literal.x, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: AND_INT T0.X, KC0[3].Y, literal.x, +; R600-NEXT: AND_INT T0.Y, KC0[3].Z, literal.x, +; R600-NEXT: LSHL T0.Z, KC0[3].Y, literal.y, +; R600-NEXT: LSHL * T1.W, KC0[3].W, literal.y, +; R600-NEXT: -2(nan), 31(4.344025e-44) ; R600-NEXT: NOT_INT * T2.W, KC0[5].Z, -; R600-NEXT: 
BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W, -; R600-NEXT: LSHR T1.Z, KC0[3].Y, 1, -; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1, -; R600-NEXT: NOT_INT * T2.W, KC0[5].Y, -; R600-NEXT: BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: AND_INT T1.X, KC0[5].Y, literal.x, +; R600-NEXT: AND_INT T1.Y, PV.W, literal.x, +; R600-NEXT: LSHL T1.Z, KC0[4].X, literal.x, +; R600-NEXT: LSHR * T2.W, KC0[5].X, 1, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: NOT_INT * T3.W, KC0[6].X, +; R600-NEXT: LSHR T2.X, KC0[4].W, 1, +; R600-NEXT: AND_INT T2.Y, PV.W, literal.x, +; R600-NEXT: OR_INT T1.Z, T1.Z, T2.W, +; R600-NEXT: LSHL T2.W, KC0[3].Z, literal.x, +; R600-NEXT: LSHR * T3.W, KC0[4].Z, 1, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: OR_INT T3.X, PV.W, PS, +; R600-NEXT: LSHR T2.Y, PV.Z, PV.Y, +; R600-NEXT: NOT_INT T1.Z, KC0[5].W, +; R600-NEXT: AND_INT * T2.W, KC0[4].X, literal.x, +; R600-NEXT: -2(nan), 0(0.000000e+00) +; R600-NEXT: AND_INT * T3.W, KC0[6].X, literal.x, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: LSHL T4.X, T2.W, PV.W, +; R600-NEXT: AND_INT T3.Y, KC0[3].W, literal.x, +; R600-NEXT: AND_INT T2.Z, KC0[5].W, literal.y, +; R600-NEXT: AND_INT T2.W, T1.Z, literal.y, +; R600-NEXT: OR_INT * T1.W, T1.W, T2.X, +; R600-NEXT: -2(nan), 31(4.344025e-44) +; R600-NEXT: LSHR T2.X, PS, PV.W, +; R600-NEXT: LSHL T3.Y, PV.Y, PV.Z, +; R600-NEXT: LSHR T1.Z, KC0[4].Y, 1, +; R600-NEXT: NOT_INT T1.W, KC0[5].Y, +; R600-NEXT: OR_INT * T2.W, PV.X, T2.Y, +; R600-NEXT: AND_INT T4.X, PV.W, literal.x, +; R600-NEXT: OR_INT T4.Y, T0.Z, PV.Z, +; R600-NEXT: OR_INT T2.Z, PV.Y, PV.X, +; R600-NEXT: LSHR T1.W, T3.X, T1.Y, +; R600-NEXT: LSHL * T0.W, T0.Y, T0.W, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: OR_INT T2.Y, PS, PV.W, +; R600-NEXT: LSHR T0.W, PV.Y, PV.X, +; R600-NEXT: LSHL * T1.W, T0.X, T1.X, +; R600-NEXT: OR_INT T2.X, PS, PV.W, +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; 
R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX10-LABEL: fshl_v4i32: @@ -536,22 +726,42 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 -; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 -; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1 -; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1 -; GFX10-NEXT: s_lshr_b32 s2, s7, 1 -; GFX10-NEXT: s_not_b32 s3, s15 -; GFX10-NEXT: s_lshr_b32 s6, s6, 1 -; GFX10-NEXT: s_not_b32 s7, s14 -; GFX10-NEXT: s_lshr_b32 s5, s5, 1 -; GFX10-NEXT: s_not_b32 s9, s13 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 -; GFX10-NEXT: s_not_b32 s8, s12 -; GFX10-NEXT: v_alignbit_b32 v3, s2, v0, s3 -; GFX10-NEXT: v_alignbit_b32 v2, s6, v1, s7 -; GFX10-NEXT: v_alignbit_b32 v1, s5, v5, s9 -; GFX10-NEXT: v_alignbit_b32 v0, s4, v6, s8 +; GFX10-NEXT: s_lshr_b32 s2, s11, 1 +; GFX10-NEXT: s_lshl_b32 s3, s7, 31 +; GFX10-NEXT: s_and_b32 s7, s7, -2 +; GFX10-NEXT: s_lshr_b32 s10, s10, 1 +; GFX10-NEXT: s_lshl_b32 s16, s6, 31 +; GFX10-NEXT: s_not_b32 s11, s15 +; GFX10-NEXT: s_not_b32 s17, s14 +; GFX10-NEXT: s_and_b32 s6, s6, -2 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_lshl_b32 s3, s7, s15 +; GFX10-NEXT: s_or_b32 s7, s16, s10 +; GFX10-NEXT: s_lshr_b32 s9, s9, 1 +; GFX10-NEXT: s_lshl_b32 s18, s5, 31 +; GFX10-NEXT: s_lshr_b32 s8, s8, 1 +; GFX10-NEXT: s_lshl_b32 s20, s4, 31 +; GFX10-NEXT: s_lshl_b32 s6, s6, s14 +; GFX10-NEXT: s_lshr_b32 s2, s2, s11 +; GFX10-NEXT: s_lshr_b32 s7, s7, s17 +; GFX10-NEXT: s_not_b32 s19, s13 +; GFX10-NEXT: s_and_b32 s5, s5, -2 +; GFX10-NEXT: s_or_b32 s9, s18, s9 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_or_b32 s3, s6, s7 +; GFX10-NEXT: s_or_b32 s6, s20, s8 +; GFX10-NEXT: s_not_b32 s7, s12 +; GFX10-NEXT: s_and_b32 s4, s4, -2 +; GFX10-NEXT: s_lshr_b32 s9, s9, s19 +; GFX10-NEXT: s_lshr_b32 s6, s6, s7 +; GFX10-NEXT: s_lshl_b32 s4, s4, s12 +; GFX10-NEXT: s_lshl_b32 s5, s5, s13 +; GFX10-NEXT: s_or_b32 s4, s4, s6 +; GFX10-NEXT: 
s_or_b32 s5, s5, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -561,24 +771,43 @@ ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 ; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1 -; GFX11-NEXT: v_alignbit_b32 v1, s6, s10, 1 -; GFX11-NEXT: v_alignbit_b32 v5, s5, s9, 1 -; GFX11-NEXT: v_alignbit_b32 v6, s4, s8, 1 -; GFX11-NEXT: s_lshr_b32 s2, s7, 1 -; GFX11-NEXT: s_not_b32 s3, s15 -; GFX11-NEXT: s_lshr_b32 s6, s6, 1 -; GFX11-NEXT: s_not_b32 s7, s14 -; GFX11-NEXT: s_lshr_b32 s5, s5, 1 -; GFX11-NEXT: s_not_b32 s9, s13 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_not_b32 s8, s12 -; GFX11-NEXT: v_alignbit_b32 v3, s2, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v2, s6, v1, s7 -; GFX11-NEXT: v_alignbit_b32 v1, s5, v5, s9 -; GFX11-NEXT: v_alignbit_b32 v0, s4, v6, s8 +; GFX11-NEXT: s_lshr_b32 s2, s11, 1 +; GFX11-NEXT: s_lshl_b32 s3, s7, 31 +; GFX11-NEXT: s_and_b32 s7, s7, -2 +; GFX11-NEXT: s_lshr_b32 s10, s10, 1 +; GFX11-NEXT: s_lshl_b32 s16, s6, 31 +; GFX11-NEXT: s_not_b32 s11, s15 +; GFX11-NEXT: s_not_b32 s17, s14 +; GFX11-NEXT: s_and_b32 s6, s6, -2 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_lshl_b32 s3, s7, s15 +; GFX11-NEXT: s_or_b32 s7, s16, s10 +; GFX11-NEXT: s_lshr_b32 s9, s9, 1 +; GFX11-NEXT: s_lshl_b32 s18, s5, 31 +; GFX11-NEXT: s_lshr_b32 s8, s8, 1 +; GFX11-NEXT: s_lshl_b32 s20, s4, 31 +; GFX11-NEXT: s_lshl_b32 s6, s6, s14 +; GFX11-NEXT: s_lshr_b32 s2, s2, s11 +; GFX11-NEXT: s_lshr_b32 s7, s7, s17 +; GFX11-NEXT: s_not_b32 s19, s13 +; GFX11-NEXT: s_and_b32 s5, s5, -2 +; GFX11-NEXT: s_or_b32 s9, s18, s9 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s7 +; GFX11-NEXT: s_or_b32 s6, s20, s8 +; 
GFX11-NEXT: s_not_b32 s7, s12 +; GFX11-NEXT: s_and_b32 s4, s4, -2 +; GFX11-NEXT: s_lshr_b32 s9, s9, s19 +; GFX11-NEXT: s_lshr_b32 s6, s6, s7 +; GFX11-NEXT: s_lshl_b32 s4, s4, s12 +; GFX11-NEXT: s_lshl_b32 s5, s5, s13 +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: s_or_b32 s5, s5, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -596,14 +825,22 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_alignbit_b32 v2, s6, v1, 23 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 31 +; SI-NEXT: s_lshr_b32 s11, s11, 31 +; SI-NEXT: s_lshl_b32 s7, s7, 1 +; SI-NEXT: s_lshr_b32 s10, s10, 23 +; SI-NEXT: s_lshl_b32 s6, s6, 9 +; SI-NEXT: s_lshr_b32 s9, s9, 25 +; SI-NEXT: s_lshl_b32 s5, s5, 7 +; SI-NEXT: s_lshr_b32 s8, s8, 31 +; SI-NEXT: s_lshl_b32 s4, s4, 1 +; SI-NEXT: s_or_b32 s7, s7, s11 +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -612,15 +849,23 @@ ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; VI-NEXT: v_alignbit_b32 v2, s6, v1, 23 -; VI-NEXT: 
v_alignbit_b32 v1, s5, v4, 25 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_lshr_b32 s2, s11, 31 +; VI-NEXT: s_lshl_b32 s3, s7, 1 +; VI-NEXT: s_lshr_b32 s7, s10, 23 +; VI-NEXT: s_lshl_b32 s6, s6, 9 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s3, s6, s7 +; VI-NEXT: s_lshr_b32 s6, s9, 25 +; VI-NEXT: s_lshl_b32 s5, s5, 7 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_lshr_b32 s6, s8, 31 +; VI-NEXT: s_lshl_b32 s4, s4, 1 +; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -631,47 +876,74 @@ ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31 +; GFX9-NEXT: s_lshr_b32 s2, s11, 31 +; GFX9-NEXT: s_lshl_b32 s3, s7, 1 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s3, s6, 9 +; GFX9-NEXT: s_lshr_b32 s6, s9, 25 +; GFX9-NEXT: s_lshl_b32 s5, s5, 7 +; GFX9-NEXT: s_lshr_b32 s7, s10, 23 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_lshr_b32 s6, s8, 31 +; GFX9-NEXT: s_lshl_b32 s4, s4, 1 +; GFX9-NEXT: s_or_b32 s3, s3, s7 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32_imm: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 17, @4, 
KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x, +; R600-NEXT: LSHL T0.W, KC0[4].X, 1, +; R600-NEXT: LSHR * T1.W, KC0[5].X, literal.x, ; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x, -; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x, -; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x, +; R600-NEXT: LSHL T0.Z, KC0[3].W, literal.x, +; R600-NEXT: LSHR T2.W, KC0[4].W, literal.y, +; R600-NEXT: OR_INT * T0.W, PV.W, PS, +; R600-NEXT: 9(1.261169e-44), 23(3.222986e-44) +; R600-NEXT: OR_INT T0.Z, PV.Z, PV.W, +; R600-NEXT: LSHL T1.W, KC0[3].Z, literal.x, +; R600-NEXT: LSHR * T2.W, KC0[4].Z, literal.y, +; R600-NEXT: 7(9.809089e-45), 25(3.503246e-44) +; R600-NEXT: OR_INT T0.Y, PV.W, PS, +; R600-NEXT: LSHL T1.W, KC0[3].Y, 1, +; R600-NEXT: LSHR * T2.W, KC0[4].Y, literal.x, ; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: OR_INT T0.X, PV.W, PS, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX10-LABEL: fshl_v4i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 31 -; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 23 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 25 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: s_lshr_b32 s2, s11, 31 +; GFX10-NEXT: s_lshl_b32 s3, s7, 1 +; GFX10-NEXT: s_lshr_b32 s7, s10, 23 +; GFX10-NEXT: s_lshl_b32 
s6, s6, 9 +; GFX10-NEXT: s_lshr_b32 s9, s9, 25 +; GFX10-NEXT: s_lshl_b32 s5, s5, 7 +; GFX10-NEXT: s_lshr_b32 s8, s8, 31 +; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_or_b32 s3, s6, s7 +; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_or_b32 s5, s5, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v4i32_imm: @@ -679,12 +951,23 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 31 -; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 23 -; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 25 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 31 +; GFX11-NEXT: s_lshr_b32 s2, s11, 31 +; GFX11-NEXT: s_lshl_b32 s3, s7, 1 +; GFX11-NEXT: s_lshr_b32 s7, s10, 23 +; GFX11-NEXT: s_lshl_b32 s6, s6, 9 +; GFX11-NEXT: s_lshr_b32 s9, s9, 25 +; GFX11-NEXT: s_lshl_b32 s5, s5, 7 +; GFX11-NEXT: s_lshr_b32 s8, s8, 31 +; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s7 +; GFX11-NEXT: s_or_b32 s4, s4, s8 +; GFX11-NEXT: s_or_b32 s5, s5, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -27,11 +27,14 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_alignbit_b32 v0, s6, v0, v1 +; SI-NEXT: s_lshl_b32 s5, s6, 1 +; SI-NEXT: s_not_b32 s6, s8 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_lshr_b32 s4, s7, s8 +; SI-NEXT: s_lshl_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -40,11 +43,14 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 +; VI-NEXT: s_lshl_b32 s2, s6, 1 +; VI-NEXT: s_lshr_b32 s1, s7, s0 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_lshl_b32 s0, s2, s0 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -54,45 +60,63 @@ ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, v2 +; GFX9-NEXT: s_lshl_b32 s1, s6, 1 +; GFX9-NEXT: s_lshr_b32 s0, s7, s2 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: NOT_INT * T0.W, KC0[3].X, +; R600-NEXT: AND_INT T0.Z, KC0[3].X, 
literal.x, +; R600-NEXT: AND_INT T0.W, PV.W, literal.x, +; R600-NEXT: LSHL * T1.W, KC0[2].Z, 1, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: LSHL T0.W, PS, PV.W, +; R600-NEXT: LSHR * T1.W, KC0[2].W, PV.Z, +; R600-NEXT: OR_INT T0.X, PV.W, PS, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X, ; ; GFX10-LABEL: fshr_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_lshl_b32 s0, s6, 1 +; GFX10-NEXT: s_not_b32 s1, s2 +; GFX10-NEXT: s_lshr_b32 s2, s7, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_lshl_b32 s1, s6, 1 +; GFX11-NEXT: s_not_b32 s2, s0 +; GFX11-NEXT: s_lshr_b32 s0, s7, s0 +; GFX11-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; 
GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -108,10 +132,12 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 +; SI-NEXT: s_lshr_b32 s0, s3, 7 +; SI-NEXT: s_lshl_b32 s1, s2, 25 +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -119,10 +145,12 @@ ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 +; VI-NEXT: s_lshr_b32 s3, s3, 7 +; VI-NEXT: s_lshl_b32 s2, s2, 25 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -131,38 +159,48 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 +; GFX9-NEXT: s_lshr_b32 s3, s3, 7 +; GFX9-NEXT: s_lshl_b32 s2, s2, 25 +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32_imm: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: LSHL T0.W, KC0[2].Z, literal.x, +; R600-NEXT: LSHR * T1.W, KC0[2].W, literal.y, +; R600-NEXT: 25(3.503246e-44), 7(9.809089e-45) +; R600-NEXT: OR_INT 
T0.X, PV.W, PS, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x, -; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX10-NEXT: s_lshr_b32 s3, s3, 7 +; GFX10-NEXT: s_lshl_b32 s2, s2, 25 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX11-NEXT: s_lshr_b32 s3, s3, 7 +; GFX11-NEXT: s_lshl_b32 s2, s2, 25 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -181,12 +219,18 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v2 +; SI-NEXT: s_lshl_b32 s5, s5, 1 +; SI-NEXT: s_lshr_b32 s7, s7, s9 +; SI-NEXT: s_not_b32 s9, s9 +; SI-NEXT: s_lshl_b32 s5, s5, s9 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_lshl_b32 s4, s4, 1 +; SI-NEXT: s_not_b32 s7, s8 +; SI-NEXT: s_lshr_b32 s6, s6, s8 +; SI-NEXT: s_lshl_b32 s4, s4, s7 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -196,13 +240,19 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0 +; VI-NEXT: s_lshl_b32 s5, s5, 1 +; VI-NEXT: s_lshr_b32 s7, s7, s3 +; VI-NEXT: s_not_b32 s3, s3 +; VI-NEXT: s_lshl_b32 s3, s5, s3 +; VI-NEXT: s_lshr_b32 s5, s6, s2 +; VI-NEXT: s_lshl_b32 s4, s4, 1 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_lshl_b32 s2, s4, s2 +; VI-NEXT: s_or_b32 s3, s3, s7 +; VI-NEXT: s_or_b32 s2, s2, s5 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -214,57 +264,91 @@ ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 +; GFX9-NEXT: s_lshl_b32 s1, s5, 1 +; GFX9-NEXT: s_lshr_b32 s0, s7, s3 +; GFX9-NEXT: s_not_b32 s3, s3 +; GFX9-NEXT: s_lshl_b32 s1, s1, s3 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_lshr_b32 s1, s6, s2 +; GFX9-NEXT: s_lshl_b32 s3, s4, 1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, 
T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: MOV * T0.W, KC0[4].X, -; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W, -; R600-NEXT: MOV * T0.W, KC0[3].W, -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W, +; R600-NEXT: NOT_INT * T0.W, KC0[4].X, +; R600-NEXT: AND_INT T0.Y, KC0[4].X, literal.x, +; R600-NEXT: AND_INT T0.Z, PV.W, literal.x, +; R600-NEXT: LSHL * T0.W, KC0[3].X, 1, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: NOT_INT * T1.W, KC0[3].W, +; R600-NEXT: AND_INT T0.X, KC0[3].W, literal.x, +; R600-NEXT: AND_INT T1.Y, PV.W, literal.x, +; R600-NEXT: LSHL T1.Z, KC0[2].W, 1, +; R600-NEXT: LSHL T0.W, T0.W, T0.Z, +; R600-NEXT: LSHR * T1.W, KC0[3].Z, T0.Y, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: OR_INT T0.Y, PV.W, PS, +; R600-NEXT: LSHL T0.W, PV.Z, PV.Y, +; R600-NEXT: LSHR * T1.W, KC0[3].Y, PV.X, +; R600-NEXT: OR_INT T0.X, PV.W, PS, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] +; GFX10-NEXT: s_lshl_b32 s1, s5, 1 +; GFX10-NEXT: s_lshr_b32 s0, s7, s3 +; GFX10-NEXT: s_not_b32 s3, s3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_not_b32 s5, s2 +; GFX10-NEXT: s_lshr_b32 s2, s6, s2 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_or_b32 
s0, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_lshr_b32 s7, s7, s3 +; GFX11-NEXT: s_not_b32 s3, s3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: s_not_b32 s8, s2 +; GFX11-NEXT: s_lshr_b32 s2, s6, s2 +; GFX11-NEXT: s_lshl_b32 s4, s4, s8 +; GFX11-NEXT: s_lshl_b32 s3, s5, s3 +; GFX11-NEXT: s_or_b32 s2, s4, s2 +; GFX11-NEXT: s_or_b32 s3, s3, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -281,10 +365,14 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, 9 -; SI-NEXT: v_alignbit_b32 v0, s4, v2, 7 +; SI-NEXT: s_lshr_b32 s7, s7, 9 +; SI-NEXT: s_lshl_b32 s5, s5, 23 +; SI-NEXT: s_lshr_b32 s6, s6, 7 +; SI-NEXT: s_lshl_b32 s4, s4, 25 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -293,11 +381,15 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9 -; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7 +; VI-NEXT: s_lshr_b32 s2, s7, 9 +; VI-NEXT: s_lshl_b32 s3, s5, 23 +; VI-NEXT: s_lshr_b32 s5, s6, 7 +; VI-NEXT: s_lshl_b32 s4, s4, 25 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s3, s4, s5 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -308,24 +400,32 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 +; GFX9-NEXT: s_lshr_b32 s0, s7, 9 +; GFX9-NEXT: s_lshl_b32 s1, s5, 23 +; GFX9-NEXT: s_lshr_b32 s5, s6, 7 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_lshl_b32 s1, s4, 25 +; GFX9-NEXT: s_or_b32 s1, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x, -; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x, -; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; R600-NEXT: LSHL T0.W, KC0[3].X, literal.x, +; R600-NEXT: LSHR * T1.W, KC0[3].Z, literal.y, +; R600-NEXT: 
23(3.222986e-44), 9(1.261169e-44) +; R600-NEXT: OR_INT T0.Y, PV.W, PS, +; R600-NEXT: LSHL T0.W, KC0[2].W, literal.x, +; R600-NEXT: LSHR * T1.W, KC0[3].Y, literal.y, +; R600-NEXT: 25(3.503246e-44), 7(9.809089e-45) +; R600-NEXT: OR_INT T0.X, PV.W, PS, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; @@ -336,8 +436,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 9 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7 +; GFX10-NEXT: s_lshr_b32 s0, s7, 9 +; GFX10-NEXT: s_lshr_b32 s1, s6, 7 +; GFX10-NEXT: s_lshl_b32 s4, s4, 25 +; GFX10-NEXT: s_lshl_b32 s5, s5, 23 +; GFX10-NEXT: s_or_b32 s1, s4, s1 +; GFX10-NEXT: s_or_b32 s0, s5, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; @@ -346,10 +452,16 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 7 +; GFX11-NEXT: s_lshr_b32 s2, s7, 9 +; GFX11-NEXT: s_lshr_b32 s3, s6, 7 +; GFX11-NEXT: s_lshl_b32 s4, s4, 25 +; GFX11-NEXT: s_lshl_b32 s5, s5, 23 +; GFX11-NEXT: s_or_b32 s3, s4, s3 +; GFX11-NEXT: s_or_b32 s2, s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -368,18 +480,30 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: 
v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: s_lshl_b32 s7, s7, 1 +; SI-NEXT: s_lshr_b32 s11, s11, s15 +; SI-NEXT: s_not_b32 s15, s15 +; SI-NEXT: s_lshl_b32 s7, s7, s15 +; SI-NEXT: s_or_b32 s7, s7, s11 +; SI-NEXT: s_lshl_b32 s6, s6, 1 +; SI-NEXT: s_not_b32 s11, s14 +; SI-NEXT: s_lshr_b32 s10, s10, s14 +; SI-NEXT: s_lshl_b32 s6, s6, s11 +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_lshl_b32 s5, s5, 1 +; SI-NEXT: s_not_b32 s10, s13 +; SI-NEXT: s_lshr_b32 s9, s9, s13 +; SI-NEXT: s_lshl_b32 s5, s5, s10 +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_lshl_b32 s4, s4, 1 +; SI-NEXT: s_not_b32 s9, s12 +; SI-NEXT: s_lshr_b32 s8, s8, s12 +; SI-NEXT: s_lshl_b32 s4, s4, s9 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -389,19 +513,31 @@ ; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s14 -; VI-NEXT: v_alignbit_b32 v2, s6, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; VI-NEXT: s_lshl_b32 s3, s7, 1 +; VI-NEXT: s_not_b32 s7, s15 +; VI-NEXT: s_lshr_b32 s2, s11, s15 +; VI-NEXT: s_lshl_b32 s3, s3, s7 +; VI-NEXT: s_lshl_b32 s6, s6, 1 +; VI-NEXT: s_not_b32 s7, s14 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_lshr_b32 
s3, s10, s14 +; VI-NEXT: s_lshl_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s5, s5, 1 +; VI-NEXT: s_not_b32 s7, s13 +; VI-NEXT: s_or_b32 s3, s6, s3 +; VI-NEXT: s_lshr_b32 s6, s9, s13 +; VI-NEXT: s_lshl_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s4, s4, 1 +; VI-NEXT: s_not_b32 s7, s12 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_lshr_b32 s6, s8, s12 +; VI-NEXT: s_lshl_b32 s4, s4, s7 +; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -413,75 +549,144 @@ ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5 +; GFX9-NEXT: s_lshl_b32 s3, s7, 1 +; GFX9-NEXT: s_not_b32 s7, s15 +; GFX9-NEXT: s_lshr_b32 s2, s11, s15 +; GFX9-NEXT: s_lshl_b32 s3, s3, s7 +; GFX9-NEXT: s_lshl_b32 s6, s6, 1 +; GFX9-NEXT: s_not_b32 s7, s14 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s3, s10, s14 +; GFX9-NEXT: s_lshl_b32 s6, s6, s7 +; GFX9-NEXT: s_lshl_b32 s5, s5, 1 +; GFX9-NEXT: s_not_b32 s7, s13 +; GFX9-NEXT: s_or_b32 s3, s6, s3 +; GFX9-NEXT: s_lshr_b32 s6, s9, s13 +; GFX9-NEXT: s_lshl_b32 s5, s5, s7 +; GFX9-NEXT: s_lshl_b32 s4, s4, 1 +; GFX9-NEXT: s_not_b32 s7, s12 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_lshr_b32 s6, s8, s12 +; GFX9-NEXT: s_lshl_b32 s4, s4, s7 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, 
s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v4i32: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: ALU 35, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: MOV * T0.W, KC0[6].X, -; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W, -; R600-NEXT: MOV * T1.W, KC0[5].W, -; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W, -; R600-NEXT: MOV * T1.W, KC0[5].Z, -; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W, -; R600-NEXT: MOV * T1.W, KC0[5].Y, -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: AND_INT * T0.W, KC0[5].Y, literal.x, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: LSHL T0.Z, KC0[3].Z, 1, +; R600-NEXT: NOT_INT T1.W, KC0[6].X, +; R600-NEXT: AND_INT * T2.W, KC0[6].X, literal.x, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: AND_INT T0.X, KC0[5].Z, literal.x, +; R600-NEXT: LSHR T0.Y, KC0[5].X, PS, +; R600-NEXT: NOT_INT T1.Z, KC0[5].W, +; R600-NEXT: AND_INT * T1.W, PV.W, literal.x, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: LSHL * T2.W, KC0[4].X, 1, +; R600-NEXT: LSHL T1.X, PV.W, T1.W, +; R600-NEXT: AND_INT T1.Y, KC0[5].W, literal.x, +; R600-NEXT: AND_INT T1.Z, T1.Z, literal.x, +; R600-NEXT: LSHL T1.W, KC0[3].W, 1, +; R600-NEXT: NOT_INT * T2.W, KC0[5].Z, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: AND_INT T2.X, PS, literal.x, +; R600-NEXT: LSHL T2.Y, PV.W, PV.Z, +; R600-NEXT: LSHR T1.Z, KC0[4].W, PV.Y, +; R600-NEXT: NOT_INT T1.W, KC0[5].Y, +; R600-NEXT: OR_INT * T2.W, PV.X, T0.Y, +; R600-NEXT: 31(4.344025e-44), 
0(0.000000e+00) +; R600-NEXT: AND_INT T1.X, PV.W, literal.x, +; R600-NEXT: LSHL T0.Y, KC0[3].Y, 1, +; R600-NEXT: OR_INT T2.Z, PV.Y, PV.Z, +; R600-NEXT: LSHL T1.W, T0.Z, PV.X, +; R600-NEXT: LSHR * T3.W, KC0[4].Z, T0.X, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: OR_INT T2.Y, PV.W, PS, +; R600-NEXT: LSHL T1.W, PV.Y, PV.X, +; R600-NEXT: LSHR * T0.W, KC0[4].Y, T0.W, +; R600-NEXT: OR_INT T2.X, PV.W, PS, +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s15 -; GFX10-NEXT: v_mov_b32_e32 v1, s14 -; GFX10-NEXT: v_mov_b32_e32 v4, s13 -; GFX10-NEXT: v_mov_b32_e32 v5, s12 -; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, v0 -; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] +; GFX10-NEXT: s_lshl_b32 s3, s7, 1 +; GFX10-NEXT: s_lshr_b32 s2, s11, s15 +; GFX10-NEXT: s_not_b32 s7, s15 +; GFX10-NEXT: s_lshl_b32 s6, s6, 1 +; GFX10-NEXT: s_not_b32 s11, s14 +; GFX10-NEXT: s_lshr_b32 s9, s9, s13 +; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: s_not_b32 s13, s13 +; GFX10-NEXT: s_lshr_b32 s8, s8, s12 +; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_not_b32 s12, s12 +; GFX10-NEXT: s_lshr_b32 s10, s10, s14 +; GFX10-NEXT: s_lshl_b32 s3, s3, s7 +; GFX10-NEXT: s_lshl_b32 s6, s6, s11 +; GFX10-NEXT: s_lshl_b32 s5, s5, s13 +; GFX10-NEXT: s_lshl_b32 s4, s4, s12 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; 
GFX10-NEXT: s_or_b32 s3, s6, s10 +; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_or_b32 s5, s5, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 -; GFX11-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v5, s12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, v0 -; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, v4 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_lshl_b32 s3, s7, 1 +; GFX11-NEXT: s_lshr_b32 s2, s11, s15 +; GFX11-NEXT: s_not_b32 s7, s15 +; GFX11-NEXT: s_lshl_b32 s6, s6, 1 +; GFX11-NEXT: s_not_b32 s11, s14 +; GFX11-NEXT: s_lshr_b32 s9, s9, s13 +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_not_b32 s13, s13 +; GFX11-NEXT: s_lshr_b32 s8, s8, s12 +; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: s_not_b32 s12, s12 +; GFX11-NEXT: s_lshr_b32 s10, s10, s14 +; GFX11-NEXT: s_lshl_b32 s3, s3, s7 +; GFX11-NEXT: s_lshl_b32 s6, s6, s11 +; GFX11-NEXT: s_lshl_b32 s5, s5, s13 +; GFX11-NEXT: s_lshl_b32 s4, s4, s12 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s10 +; GFX11-NEXT: s_or_b32 s4, s4, s8 +; GFX11-NEXT: s_or_b32 s5, s5, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -498,14 +703,22 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_alignbit_b32 v2, s6, v1, 9 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; SI-NEXT: s_lshr_b32 s11, s11, 1 +; SI-NEXT: s_lshl_b32 s7, s7, 31 +; SI-NEXT: s_lshr_b32 s10, s10, 9 +; SI-NEXT: s_lshl_b32 s6, s6, 23 +; SI-NEXT: s_lshr_b32 s9, s9, 7 +; SI-NEXT: s_lshl_b32 s5, s5, 25 +; SI-NEXT: s_lshr_b32 s8, s8, 1 +; SI-NEXT: s_lshl_b32 s4, s4, 31 +; SI-NEXT: s_or_b32 s7, s7, s11 +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -514,15 +727,23 @@ ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9 -; VI-NEXT: v_alignbit_b32 v1, s5, v4, 7 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_lshr_b32 s2, s11, 1 +; VI-NEXT: s_lshl_b32 s3, s7, 31 +; VI-NEXT: s_lshr_b32 s7, s10, 9 +; VI-NEXT: s_lshl_b32 s6, s6, 23 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s3, s6, s7 +; VI-NEXT: s_lshr_b32 s6, s9, 7 +; VI-NEXT: s_lshl_b32 s5, s5, 25 +; VI-NEXT: s_or_b32 s5, 
s5, s6 +; VI-NEXT: s_lshr_b32 s6, s8, 1 +; VI-NEXT: s_lshl_b32 s4, s4, 31 +; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -533,45 +754,74 @@ ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; GFX9-NEXT: s_lshr_b32 s2, s11, 1 +; GFX9-NEXT: s_lshl_b32 s3, s7, 31 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s3, s6, 23 +; GFX9-NEXT: s_lshr_b32 s6, s9, 7 +; GFX9-NEXT: s_lshl_b32 s5, s5, 25 +; GFX9-NEXT: s_lshr_b32 s7, s10, 9 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_lshr_b32 s6, s8, 1 +; GFX9-NEXT: s_lshl_b32 s4, s4, 31 +; GFX9-NEXT: s_or_b32 s3, s3, s7 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v4i32_imm: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1, -; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x, -; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.Y, 
KC0[3].Z, KC0[4].Z, literal.x, -; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1, +; R600-NEXT: LSHL T0.W, KC0[4].X, literal.x, +; R600-NEXT: LSHR * T1.W, KC0[5].X, 1, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: LSHL T0.Z, KC0[3].W, literal.x, +; R600-NEXT: LSHR T2.W, KC0[4].W, literal.y, +; R600-NEXT: OR_INT * T0.W, PV.W, PS, +; R600-NEXT: 23(3.222986e-44), 9(1.261169e-44) +; R600-NEXT: OR_INT T0.Z, PV.Z, PV.W, +; R600-NEXT: LSHL T1.W, KC0[3].Z, literal.x, +; R600-NEXT: LSHR * T2.W, KC0[4].Z, literal.y, +; R600-NEXT: 25(3.503246e-44), 7(9.809089e-45) +; R600-NEXT: OR_INT T0.Y, PV.W, PS, +; R600-NEXT: LSHL T1.W, KC0[3].Y, literal.x, +; R600-NEXT: LSHR * T2.W, KC0[4].Y, 1, +; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; R600-NEXT: OR_INT T0.X, PV.W, PS, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX10-LABEL: fshr_v4i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 -; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: s_lshr_b32 s2, s11, 1 +; GFX10-NEXT: s_lshl_b32 s3, s7, 31 +; GFX10-NEXT: s_lshr_b32 s7, s10, 9 +; GFX10-NEXT: s_lshl_b32 s6, s6, 23 +; GFX10-NEXT: s_lshr_b32 s9, s9, 7 +; GFX10-NEXT: s_lshl_b32 s5, s5, 25 +; GFX10-NEXT: s_lshr_b32 s8, s8, 1 +; GFX10-NEXT: s_lshl_b32 s4, s4, 31 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_or_b32 s3, s6, s7 +; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_or_b32 s5, s5, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32_imm: @@ -579,12 +829,23 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1 -; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 9 -; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 7 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 1 +; GFX11-NEXT: s_lshr_b32 s2, s11, 1 +; GFX11-NEXT: s_lshl_b32 s3, s7, 31 +; GFX11-NEXT: s_lshr_b32 s7, s10, 9 +; GFX11-NEXT: s_lshl_b32 s6, s6, 23 +; GFX11-NEXT: s_lshr_b32 s9, s9, 7 +; GFX11-NEXT: s_lshl_b32 s5, s5, 25 +; GFX11-NEXT: s_lshr_b32 s8, s8, 1 +; GFX11-NEXT: s_lshl_b32 s4, s4, 31 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s7 +; GFX11-NEXT: s_or_b32 s4, s4, s8 +; GFX11-NEXT: s_or_b32 s5, s5, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -222,11 +222,12 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 
; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -237,11 +238,12 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s0, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s0, s1, s0 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -291,13 +293,13 @@ ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s1, s0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: ; use s0 @@ -310,13 +312,13 @@ ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s1, s1, s0 +; CI-NEXT: v_mov_b32_e32 v2, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART ; CI-NEXT: ; use s0 @@ -379,13 +381,14 @@ ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: 
v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s2, s0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: ; use s0 @@ -401,19 +404,20 @@ ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s1, s1, s0 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: s_lshr_b32 s2, s2, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART ; CI-NEXT: ; use s0 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s2 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -8068,19 +8068,21 @@ ; GFX6-NOHSA: ; %bb.0: ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NOHSA-NEXT: 
v_and_b32_e32 v1, 0xff00ff, v0 +; GFX6-NOHSA-NEXT: s_and_b32 s4, s2, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s5, s2, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s2, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s4, s4, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s6 +; GFX6-NOHSA-NEXT: s_and_b32 s5, s2, 0xff00ff +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -8093,14 +8095,16 @@ ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff +; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s2, s2, 8 ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16 -; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 +; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 +; GFX7-HSA-NEXT: s_or_b32 s1, s2, s3 +; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff00ff ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -8112,14 +8116,15 @@ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s0, v3, 16 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2 +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_lshl_b32 
s0, s0, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff ; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2 +; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -8310,26 +8315,30 @@ ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s2, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s6, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s7, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s8, s4, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s5, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s4, 8 ; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000 +; GFX6-NOHSA-NEXT: s_or_b32 s6, s7, s6 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s8, s2 +; 
GFX6-NOHSA-NEXT: s_or_b32 s2, s5, s9 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s10 +; GFX6-NOHSA-NEXT: s_and_b32 s5, s2, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -8338,27 +8347,31 @@ ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16 +; GFX7-HSA-NEXT: s_and_b32 s1, s3, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 -; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-HSA-NEXT: s_and_b32 s5, s2, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s6, s3, 16 +; GFX7-HSA-NEXT: s_or_b32 s1, s4, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s3, s3, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s2, s2, 8 +; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff0000 ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4 -; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 +; GFX7-HSA-NEXT: s_or_b32 s3, s3, s6 +; GFX7-HSA-NEXT: s_or_b32 s2, s2, s4 +; 
GFX7-HSA-NEXT: s_or_b32 s0, s5, s0 +; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -8370,22 +8383,23 @@ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s2 -; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX8-NOHSA-NEXT: s_or_b32 s0, s4, s1 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s3, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 24 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s5, s4 +; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s3, 0xff +; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v1 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -8683,43 +8697,51 @@ ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s9, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s10, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s11, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s12, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s13, s4, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s5, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s4, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s7, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s17, s6, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s7, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s6, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s11, s11, 8 ; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 
0xff0000 +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s9, s9, 8 ; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 +; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000 +; GFX6-NOHSA-NEXT: s_or_b32 s11, s12, s11 +; GFX6-NOHSA-NEXT: s_or_b32 s10, s13, s10 ; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s12 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s10 -; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s15 +; GFX6-NOHSA-NEXT: s_or_b32 s9, s16, s9 +; GFX6-NOHSA-NEXT: s_or_b32 s8, s17, s8 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s18 +; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s19 +; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16: @@ -8728,48 +8750,56 @@ ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s11, s4, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s9, s7, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 +; GFX7-HSA-NEXT: s_and_b32 s9, s5, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s10, s5, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-HSA-NEXT: s_and_b32 s8, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s11, s4, 0xff +; GFX7-HSA-NEXT: s_or_b32 s9, s10, s9 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s4, s4, 8 +; GFX7-HSA-NEXT: s_and_b32 s3, s7, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff0000 ; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24 -; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff +; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10 +; GFX7-HSA-NEXT: s_and_b32 s10, s7, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-HSA-NEXT: s_or_b32 s3, s10, s3 +; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX7-HSA-NEXT: s_or_b32 s2, s10, s2 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s7, s7, 8 +; GFX7-HSA-NEXT: 
s_and_b32 s7, s7, 0xff0000 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s5, s5, 8 +; GFX7-HSA-NEXT: s_or_b32 s7, s7, s10 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s6, s6, 8 +; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff0000 +; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff0000 +; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8 ; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12 -; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10 -; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8 -; GFX7-HSA-NEXT: s_or_b32 s2, s3, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_or_b32 s6, s6, s10 +; GFX7-HSA-NEXT: s_or_b32 s8, s11, s8 +; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -8780,48 +8810,50 @@ ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s4 -; GFX8-NOHSA-NEXT: 
s_lshr_b32 s3, s4, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24 -; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s7, 0x80010 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s8, s5, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NOHSA-NEXT: s_or_b32 s3, s4, s3 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s7, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6 -; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24 -; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s4, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s4 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8 -; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5 -; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s7 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s2, v3, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NOHSA-NEXT: s_or_b32 s10, s3, s2 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s9, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s5, s8, s5 +; GFX8-NOHSA-NEXT: s_or_b32 s8, s3, s2 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s4, 0xff +; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010 +; 
GFX8-NOHSA-NEXT: s_lshl_b32 s4, s7, 8 +; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s7, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000 +; GFX8-NOHSA-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s6, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s6, 0x80010 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s6 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s7, s4 +; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 +; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s6, v1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[1:4] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -9346,81 +9378,97 @@ ; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 
24 -; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2 -; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s13, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s15, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s2, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s17, s3, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s18, s0, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s19, s1, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s20, s1, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s21, s0, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s1, s1, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s0, 16 +; 
GFX6-NOHSA-NEXT: s_lshr_b32 s0, s0, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s24, s3, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s25, s2, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s3, s3, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s2, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s28, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s29, s4, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s5, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s31, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s4, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s33, s7, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s34, s6, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s35, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s7, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s6, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s19, s19, 8 ; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 +; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 8 ; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s15, 8 ; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s13, s13, 8 ; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 -; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8 -; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, 
v4 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v6 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14 -; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000 +; GFX6-NOHSA-NEXT: s_or_b32 s19, s20, s19 +; GFX6-NOHSA-NEXT: s_or_b32 s18, s21, s18 +; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s22 +; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s23 +; GFX6-NOHSA-NEXT: s_or_b32 s17, s24, s17 +; GFX6-NOHSA-NEXT: s_or_b32 s16, s25, s16 +; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s26 +; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s27 +; GFX6-NOHSA-NEXT: s_or_b32 s15, s28, s15 +; GFX6-NOHSA-NEXT: s_or_b32 s14, s29, s14 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s30 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s31 +; GFX6-NOHSA-NEXT: s_or_b32 s13, s33, s13 +; GFX6-NOHSA-NEXT: s_or_b32 s12, s34, s12 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s35 +; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s36 +; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff00ff +; 
GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s3 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: @@ -9429,94 +9477,110 @@ ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s25, s1, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s25, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s23, s0, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s23, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s21, s3, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24 -; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00 -; 
GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24 -; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24 -; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8 -; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1 +; GFX7-HSA-NEXT: s_and_b32 s17, s1, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s16, s0, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s18, s1, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s17, s17, 8 +; GFX7-HSA-NEXT: s_or_b32 s17, s18, s17 +; GFX7-HSA-NEXT: s_and_b32 s18, s0, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s16, s16, 8 +; GFX7-HSA-NEXT: s_or_b32 s16, s18, s16 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s1, 8 +; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s1, s1, s18 +; GFX7-HSA-NEXT: s_and_b32 s18, s1, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b32 s1, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s0, s0, 8 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 +; GFX7-HSA-NEXT: s_and_b32 s15, s3, 0xff00 +; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1 +; GFX7-HSA-NEXT: s_and_b32 s14, s2, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s19, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24 -; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8 +; GFX7-HSA-NEXT: s_or_b32 s15, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8 -; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 +; GFX7-HSA-NEXT: s_or_b32 
s14, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s0, s3, 16 +; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 8 +; GFX7-HSA-NEXT: s_and_b32 s3, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000 +; GFX7-HSA-NEXT: s_and_b32 s13, s5, 0xff00 +; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 +; GFX7-HSA-NEXT: s_and_b32 s12, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s2, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24 -; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1 -; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s13, 8 +; GFX7-HSA-NEXT: s_or_b32 s13, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8 +; GFX7-HSA-NEXT: s_or_b32 s12, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s5, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s0, s5, 16 +; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s4, 8 +; GFX7-HSA-NEXT: s_and_b32 s5, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b32 s0, s4, 16 +; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000 +; GFX7-HSA-NEXT: s_and_b32 s11, s7, 0xff00 +; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 ; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 +; GFX7-HSA-NEXT: s_and_b32 s4, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s11, 8 ; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_and_b32 s1, 
s6, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8 -; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-HSA-NEXT: s_or_b32 s1, s1, s10 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s7, s7, 8 +; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s7, s7, s10 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s6, s6, 8 +; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s6, s6, s10 +; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v8, s0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -9527,94 +9591,99 @@ ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s1, 24 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0 -; GFX8-NOHSA-NEXT: s_lshl_b32 s14, s14, 16 -; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24 -; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14 -; GFX8-NOHSA-NEXT: s_and_b32 s15, s1, 0xff +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s1, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s1, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s12, s1, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX8-NOHSA-NEXT: s_lshl_b32 s10, s10, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 +; GFX8-NOHSA-NEXT: s_or_b32 s10, s11, s10 +; GFX8-NOHSA-NEXT: s_or_b32 s11, s12, s1 +; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s12, s0, 0x80010 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 ; 
GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24 -; GFX8-NOHSA-NEXT: s_or_b32 s15, s15, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s12, s12, s1 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16 ; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1 +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s2 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s14, s1, s0 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NOHSA-NEXT: s_or_b32 s2, s1, s0 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1 +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s4, 24 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s4 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s15, s1, s0 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s0, v4 +; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 
24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s7, 0x80010 ; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s7, 8 ; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24 ; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s6, 0xff -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0 +; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s6, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s6, 0x80010 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s6 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s7, s4 +; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v7, s10, v7, 16 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s4, v6 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] -; GFX8-NOHSA-NEXT: v_alignbit_b32 v5, s11, v5, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NOHSA-NEXT: v_or_b32_e32 v3, s6, v3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; 
GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s12, v3, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[3:6] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[1:4] +; GFX8-NOHSA-NEXT: s_nop 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -17,7 +17,8 @@ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -38,8 +39,8 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) 
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -64,7 +65,8 @@ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -85,8 +87,8 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -786,7 +786,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -795,13 +795,19 @@ ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: AND_INT T1.Y, T0.Z, literal.x, -; EG-NEXT: LSHR T1.Z, T0.Y, 1, -; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1, +; EG-NEXT: LSHL T1.Z, T0.Y, literal.x, +; EG-NEXT: LSHR T0.W, T0.X, 1, ; EG-NEXT: NOT_INT * T1.W, T0.Z, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T1.Z, PV.Z, 
PV.W, PS, -; EG-NEXT: LSHL T0.W, T0.X, PV.Y, +; EG-NEXT: AND_INT T0.Y, T0.Y, literal.x, +; EG-NEXT: AND_INT T2.Z, T0.Z, literal.y, +; EG-NEXT: AND_INT T1.W, PS, literal.y, +; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; EG-NEXT: -2(nan), 31(4.344025e-44) +; EG-NEXT: LSHR T0.W, PS, PV.W, +; EG-NEXT: LSHL * T1.W, PV.Y, PV.Z, +; EG-NEXT: OR_INT T1.Z, PS, PV.W, +; EG-NEXT: LSHL T0.W, T0.X, T2.Z, ; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, @@ -858,37 +864,50 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 22, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 +; EG-NEXT: ALU 35, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1 ; EG-NEXT: ALU clause starting at 10: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x, -; EG-NEXT: LSHR T2.Z, T0.W, 1, -; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, -; EG-NEXT: NOT_INT * T1.W, T1.Z, +; EG-NEXT: AND_INT * T0.W, T1.Y, literal.x, +; EG-NEXT: -2(nan), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.X, T1.W, literal.x, +; EG-NEXT: AND_INT T0.Y, T0.Z, literal.y, +; EG-NEXT: LSHL T2.Z, T1.W, literal.y, +; EG-NEXT: LSHR T1.W, T1.Z, 1, BS:VEC_120/SCL_212 +; EG-NEXT: NOT_INT * T2.W, T0.Z, +; EG-NEXT: -2(nan), 31(4.344025e-44) +; EG-NEXT: AND_INT T3.X, PS, literal.x, +; EG-NEXT: OR_INT T2.Y, PV.Z, PV.W, +; EG-NEXT: LSHL T2.Z, T1.Y, literal.x, +; EG-NEXT: LSHR T1.W, T1.X, 1, +; EG-NEXT: NOT_INT * T2.W, T0.X, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Z, PV.W, PS, -; EG-NEXT: LSHL * T1.W, T0.Z, PV.Y, -; 
EG-NEXT: AND_INT T2.X, T1.Z, literal.x, -; EG-NEXT: AND_INT T1.Y, T1.X, literal.y, -; EG-NEXT: LSHR T0.Z, T0.Y, 1, -; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1, -; EG-NEXT: NOT_INT * T3.W, T1.X, -; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) -; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS, -; EG-NEXT: LSHL T0.Z, T0.X, PV.Y, -; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: CNDE_INT * T3.W, PV.X, T0.W, T1.W, +; EG-NEXT: AND_INT T4.X, T0.X, literal.x, +; EG-NEXT: AND_INT T1.Y, PS, literal.x, +; EG-NEXT: OR_INT T2.Z, PV.Z, PV.W, +; EG-NEXT: LSHR T1.W, PV.Y, PV.X, +; EG-NEXT: LSHL * T2.W, T2.X, T0.Y, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T2.X, PS, PV.W, +; EG-NEXT: LSHL T0.Y, T1.Z, T0.Y, +; EG-NEXT: AND_INT T0.Z, T0.Z, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T1.W, PV.Z, PV.Y, +; EG-NEXT: LSHL * T0.W, T0.W, PV.X, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T1.Y, PS, PV.W, +; EG-NEXT: LSHL T1.Z, T1.X, T4.X, +; EG-NEXT: AND_INT T0.W, T0.X, literal.x, BS:VEC_201 +; EG-NEXT: CNDE_INT * T2.W, PV.Z, PV.X, PV.Y, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z, -; EG-NEXT: CNDE_INT * T3.Z, T2.X, T1.W, 0.0, -; EG-NEXT: CNDE_INT T3.X, T2.W, T0.Z, 0.0, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PV.Y, PV.Z, +; EG-NEXT: CNDE_INT * T2.Z, T0.Z, T0.Y, 0.0, +; EG-NEXT: CNDE_INT T2.X, T0.W, T1.Z, 0.0, ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1 @@ -955,63 +974,88 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 3 @6 -; EG-NEXT: ALU 47, @15, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1 +; EG-NEXT: ALU 72, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END 
; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1 -; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1 -; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 32, #1 -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1 +; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 48, #1 +; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1 ; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: AND_INT T4.Z, T1.Z, literal.x, -; EG-NEXT: LSHR T1.W, T0.W, 1, -; EG-NEXT: NOT_INT * T3.W, T1.Z, +; EG-NEXT: LSHL T4.Z, T1.W, literal.x, +; EG-NEXT: LSHR T0.W, T1.Z, 1, +; EG-NEXT: NOT_INT * T2.W, T0.Z, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T4.X, T0.W, T0.Z, 1, -; EG-NEXT: AND_INT T1.Y, T3.Z, literal.x, BS:VEC_201 -; EG-NEXT: LSHR T5.Z, T2.W, 1, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221 -; EG-NEXT: NOT_INT * T2.W, T3.Z, +; EG-NEXT: AND_INT T0.Y, T1.W, literal.x, +; EG-NEXT: AND_INT T5.Z, T0.Z, literal.y, +; EG-NEXT: AND_INT T1.W, PS, literal.y, +; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; EG-NEXT: -2(nan), 31(4.344025e-44) +; EG-NEXT: LSHL T4.X, T3.W, literal.x, +; EG-NEXT: LSHR T2.Y, T3.Z, 1, +; EG-NEXT: NOT_INT T4.Z, T2.Z, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T0.W, PS, PV.W, +; EG-NEXT: LSHL * T1.W, PV.Y, PV.Z, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T5.X, PS, PV.W, +; EG-NEXT: AND_INT T0.Y, T3.W, literal.x, +; EG-NEXT: AND_INT T6.Z, T2.Z, literal.y, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.y, +; EG-NEXT: OR_INT * T1.W, PV.X, PV.Y, +; EG-NEXT: -2(nan), 31(4.344025e-44) +; EG-NEXT: AND_INT T4.X, T1.Y, literal.x, +; EG-NEXT: AND_INT T2.Y, T0.X, literal.y, +; EG-NEXT: LSHR T4.Z, PS, PV.W, +; EG-NEXT: LSHL T0.W, PV.Y, PV.Z, +; EG-NEXT: NOT_INT * T1.W, T0.X, +; EG-NEXT: -2(nan), 31(4.344025e-44) +; EG-NEXT: AND_INT T6.X, PS, literal.x, +; EG-NEXT: 
OR_INT T0.Y, PV.W, PV.Z, +; EG-NEXT: LSHL T4.Z, T3.Y, literal.x, +; EG-NEXT: LSHR T0.W, T3.X, 1, +; EG-NEXT: NOT_INT * T1.W, T2.X, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.Y, PV.Z, PV.W, PS, -; EG-NEXT: LSHL T2.Z, T2.Z, PV.Y, -; EG-NEXT: BIT_ALIGN_INT T0.W, T1.W, PV.X, T3.W, -; EG-NEXT: LSHL * T1.W, T0.Z, T4.Z, -; EG-NEXT: AND_INT T4.X, T1.Z, literal.x, -; EG-NEXT: AND_INT T1.Y, T1.X, literal.y, -; EG-NEXT: LSHR T0.Z, T0.Y, 1, -; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1, -; EG-NEXT: NOT_INT * T3.W, T1.X, +; EG-NEXT: LSHL T7.X, T3.Z, T6.Z, +; EG-NEXT: AND_INT T3.Y, T3.Y, literal.x, +; EG-NEXT: AND_INT T3.Z, T2.X, literal.y, +; EG-NEXT: AND_INT T1.W, PS, literal.y, +; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; EG-NEXT: -2(nan), 31(4.344025e-44) +; EG-NEXT: AND_INT T8.X, T2.Z, literal.x, +; EG-NEXT: LSHR T4.Y, PS, PV.W, +; EG-NEXT: LSHL T2.Z, PV.Y, PV.Z, +; EG-NEXT: LSHL T0.W, T1.Y, literal.y, +; EG-NEXT: LSHR * T1.W, T1.X, 1, ; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) -; EG-NEXT: AND_INT T5.X, T3.Z, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS, -; EG-NEXT: LSHL T0.Z, T0.X, PV.Y, -; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: CNDE_INT * T4.W, PV.X, T0.W, T1.W, +; EG-NEXT: OR_INT T9.X, PV.W, PS, +; EG-NEXT: OR_INT T1.Y, PV.Z, PV.Y, +; EG-NEXT: LSHL T2.Z, T3.X, T3.Z, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T0.W, T2.X, literal.x, BS:VEC_201 +; EG-NEXT: CNDE_INT * T1.W, PV.X, T0.Y, T7.X, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.X, T3.X, literal.x, -; EG-NEXT: CNDE_INT T4.Y, PV.W, PV.Y, PV.Z, -; EG-NEXT: LSHR T1.Z, T2.Y, 1, -; EG-NEXT: BIT_ALIGN_INT T0.W, T2.Y, T2.X, 1, -; EG-NEXT: NOT_INT * T3.W, T3.X, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T1.X, PV.Z, PV.W, PS, -; EG-NEXT: LSHL T0.Y, T2.X, PV.X, -; EG-NEXT: CNDE_INT T4.Z, T4.X, T1.W, 0.0, BS:VEC_120/SCL_212 -; EG-NEXT: AND_INT * T0.W, T3.X, literal.x, BS:VEC_201 +; EG-NEXT: LSHL T2.X, T1.Z, 
T5.Z, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, +; EG-NEXT: AND_INT T0.Z, T0.Z, literal.x, BS:VEC_201 +; EG-NEXT: LSHR T2.W, PV.X, T6.X, +; EG-NEXT: LSHL * T3.W, T4.X, T2.Y, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T3.X, PS, PV.W, +; EG-NEXT: LSHL T0.Y, T1.X, T2.Y, +; EG-NEXT: CNDE_INT * T1.Z, T8.X, T7.X, 0.0, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T2.W, T0.X, literal.x, BS:VEC_201 +; EG-NEXT: CNDE_INT * T3.W, T0.Z, T5.X, T2.X, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT * T1.W, T5.X, T3.Y, T2.Z, -; EG-NEXT: CNDE_INT T4.X, T2.W, T0.Z, 0.0, -; EG-NEXT: CNDE_INT T1.Y, T0.W, T1.X, T0.Y, BS:VEC_120/SCL_212 -; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, +; EG-NEXT: CNDE_INT T1.X, T0.W, T2.Z, 0.0, +; EG-NEXT: CNDE_INT T3.Y, PV.W, T3.X, T0.Y, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T0.X, PV.W, literal.x, -; EG-NEXT: CNDE_INT T1.Z, T5.X, T2.Z, 0.0, -; EG-NEXT: CNDE_INT * T1.X, T0.W, T0.Y, 0.0, +; EG-NEXT: CNDE_INT T3.Z, T0.Z, T2.X, 0.0, +; EG-NEXT: CNDE_INT * T3.X, T2.W, T0.Y, 0.0, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -1167,20 +1211,23 @@ ; ; EG-LABEL: s_shl_constant_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 15, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.W, literal.y, -; EG-NEXT: NOT_INT * T1.W, KC0[2].W, -; EG-NEXT: 31(4.344025e-44), -1(nan) -; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS, -; EG-NEXT: LSHL T0.W, literal.y, PV.Z, -; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z, -; EG-NEXT: 32767(4.591635e-41), -1(nan) -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: NOT_INT T0.W, KC0[2].W, +; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, 
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T0.W, literal.x, PV.W, +; EG-NEXT: LSHL * T2.W, literal.y, T1.W, +; EG-NEXT: -1(nan), 65534(9.183269e-41) +; EG-NEXT: OR_INT T0.Z, PS, PV.W, +; EG-NEXT: LSHL T0.W, literal.x, T1.W, +; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, +; EG-NEXT: -1(nan), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, ; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1233,7 +1280,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1242,17 +1289,20 @@ ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: NOT_INT T0.Z, T0.X, -; EG-NEXT: MOV T0.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.y, -; EG-NEXT: 1435293955(1.935796e+13), 31(4.344025e-44) -; EG-NEXT: LSHL T1.Z, literal.x, PS, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.W, PV.Z, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.z, -; EG-NEXT: -1424379385(-5.460358e-13), 143(2.003857e-43) -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, -; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0, +; EG-NEXT: NOT_INT T0.W, T0.X, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T0.W, literal.x, PV.W, +; EG-NEXT: LSHL * T2.W, literal.y, T1.W, +; EG-NEXT: 1435293955(1.935796e+13), 286(4.007714e-43) +; EG-NEXT: OR_INT T0.Z, PS, PV.W, +; EG-NEXT: AND_INT T0.W, T0.X, literal.x, +; EG-NEXT: LSHL * T1.W, literal.y, T1.W, +; EG-NEXT: 32(4.484155e-44), -1424379385(-5.460358e-13) +; EG-NEXT: 
CNDE_INT * T0.Y, PV.W, PV.Z, PS, +; EG-NEXT: CNDE_INT T0.X, T0.W, T1.W, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %a = load i64, ptr addrspace(1) %aptr, align 8 @@ -1302,7 +1352,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1311,11 +1361,13 @@ ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: AND_INT T0.W, T0.X, literal.x, -; EG-NEXT: NOT_INT * T1.W, T0.X, +; EG-NEXT: NOT_INT T0.W, T0.X, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, -; EG-NEXT: LSHL T0.W, literal.y, PV.W, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T0.Z, literal.x, PV.W, +; EG-NEXT: LSHL T0.W, literal.y, T1.W, ; EG-NEXT: AND_INT * T1.W, T0.X, literal.z, ; EG-NEXT: 617283(8.649977e-40), 1234567(1.729997e-39) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) @@ -1366,7 +1418,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1375,11 +1427,13 @@ ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: AND_INT T0.W, T0.X, literal.x, -; EG-NEXT: NOT_INT * T1.W, T0.X, +; EG-NEXT: NOT_INT T0.W, T0.X, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, -; EG-NEXT: LSHL T0.W, 
literal.y, PV.W, +; EG-NEXT: LSHR T0.Z, literal.x, PV.W, +; EG-NEXT: LSHL T0.W, literal.y, T1.W, ; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 32(4.484155e-44), 64(8.968310e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, @@ -1421,16 +1475,17 @@ ; ; EG-LABEL: s_shl_inline_imm_64_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT T0.W, KC0[2].W, +; EG-NEXT: NOT_INT * T0.W, KC0[2].W, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: LSHL T0.Z, literal.x, PS, -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W, +; EG-NEXT: LSHR T0.W, literal.y, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, ; EG-NEXT: 64(8.968310e-44), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, @@ -1522,15 +1577,16 @@ ; ; EG-LABEL: s_shl_inline_imm_1_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 536346624(1.050321e-19), 32(4.484155e-44) +; EG-NEXT: 1072693248(1.875000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1569,15 +1625,16 @@ ; ; EG-LABEL: s_shl_inline_imm_neg_1_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 1610088448(3.574057e+19), 32(4.484155e-44) +; EG-NEXT: -1074790400(-1.875000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1616,15 +1673,16 @@ ; ; EG-LABEL: s_shl_inline_imm_0_5_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 535822336(1.016440e-19), 32(4.484155e-44) +; EG-NEXT: 1071644672(1.750000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1663,15 +1721,16 @@ ; ; EG-LABEL: s_shl_inline_imm_neg_0_5_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 1609564160(3.458765e+19), 
32(4.484155e-44) +; EG-NEXT: -1075838976(-1.750000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1710,15 +1769,16 @@ ; ; EG-LABEL: s_shl_inline_imm_2_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 536870912(1.084202e-19), 32(4.484155e-44) +; EG-NEXT: 1073741824(2.000000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1757,15 +1817,16 @@ ; ; EG-LABEL: s_shl_inline_imm_neg_2_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 1610612736(3.689349e+19), 32(4.484155e-44) +; EG-NEXT: -1073741824(-2.000000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1804,15 +1865,16 @@ ; ; EG-LABEL: s_shl_inline_imm_4_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; 
EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 537395200(1.151965e-19), 32(4.484155e-44) +; EG-NEXT: 1074790400(2.250000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1851,15 +1913,16 @@ ; ; EG-LABEL: s_shl_inline_imm_neg_4_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 1611137024(3.919933e+19), 32(4.484155e-44) +; EG-NEXT: -1072693248(-2.250000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1903,16 +1966,17 @@ ; ; EG-LABEL: s_shl_inline_imm_f32_4_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT T0.W, KC0[2].W, +; EG-NEXT: NOT_INT * T0.W, KC0[2].W, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: LSHL T0.Z, literal.x, PS, -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W, +; EG-NEXT: LSHR T0.W, literal.y, PV.W, 
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z, ; EG-NEXT: 1082130432(4.000000e+00), 541065216(1.626303e-19) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) @@ -1959,20 +2023,23 @@ ; ; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 15, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.W, literal.y, -; EG-NEXT: NOT_INT * T1.W, KC0[2].W, -; EG-NEXT: 31(4.344025e-44), -532676608(-5.534023e+19) -; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS, -; EG-NEXT: LSHL T0.W, literal.y, PV.Z, -; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z, -; EG-NEXT: 2147483647(nan), -1065353216(-4.000000e+00) -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: NOT_INT T0.W, KC0[2].W, +; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T0.W, literal.x, PV.W, +; EG-NEXT: LSHL * T2.W, literal.y, T1.W, +; EG-NEXT: -532676608(-5.534023e+19), -2(nan) +; EG-NEXT: OR_INT T0.Z, PS, PV.W, +; EG-NEXT: LSHL T0.W, literal.x, T1.W, +; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, +; EG-NEXT: -1065353216(-4.000000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, ; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -2015,15 +2082,16 @@ ; ; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 
31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 541065216(1.626303e-19), 32(4.484155e-44) +; EG-NEXT: 1082130432(4.000000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -2066,15 +2134,16 @@ ; ; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT * T0.W, KC0[2].W, -; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, +; EG-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T0.W, literal.x, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 1614807040(5.534023e+19), 32(4.484155e-44) +; EG-NEXT: -1065353216(-4.000000e+00), 32(4.484155e-44) ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, ; EG-NEXT: MOV T0.X, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -423,16 +423,19 @@ ; ; EG-LABEL: s_ashr_i64: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: ASHR * T0.Y, KC0[2].Z, literal.x, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) +; EG-NEXT: ASHR T0.Y, KC0[2].Z, literal.x, +; EG-NEXT: LSHR * T0.W, KC0[2].Z, literal.y, +; EG-NEXT: 31(4.344025e-44), 8(1.121039e-44) +; EG-NEXT: LSHL * T1.W, PV.Y, 
literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, PV.W, T0.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %in.ext = sext i32 %in to i64 %ashr = ashr i64 %in.ext, 8 @@ -481,7 +484,7 @@ ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -490,17 +493,22 @@ ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: AND_INT * T0.W, T0.Z, literal.x, +; EG-NEXT: NOT_INT * T0.W, T0.Z, +; EG-NEXT: AND_INT T1.Z, T0.Z, literal.x, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: LSHL * T1.W, T0.Y, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: ASHR T1.Z, T0.Y, PV.W, -; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z, +; EG-NEXT: LSHL T0.W, PS, PV.W, +; EG-NEXT: LSHR * T1.W, T0.X, PV.Z, +; EG-NEXT: OR_INT T2.Z, PV.W, PS, +; EG-NEXT: ASHR T0.W, T0.Y, T1.Z, ; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.X, PS, PV.W, PV.Z, -; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: CNDE_INT T0.X, PS, PV.Z, PV.W, +; EG-NEXT: ASHR T2.W, T0.Y, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 2(2.802597e-45) -; EG-NEXT: CNDE_INT * T0.Y, T1.W, T1.Z, PV.W, +; EG-NEXT: CNDE_INT * T0.Y, T1.W, T0.W, PV.W, entry: %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1 %a = load i64, ptr addrspace(1) %in @@ -555,8 +563,8 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 19, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; EG-NEXT: ALU 30, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; 
EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: @@ -565,25 +573,36 @@ ; EG-NEXT: ALU clause starting at 10: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x, +; EG-NEXT: NOT_INT * T1.W, T1.Z, +; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x, +; EG-NEXT: AND_INT T2.Z, PV.W, literal.x, +; EG-NEXT: LSHL T1.W, T0.W, 1, +; EG-NEXT: NOT_INT * T2.W, T1.X, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: ASHR T1.Y, T0.W, PV.W, -; EG-NEXT: AND_INT T2.Z, T1.Z, literal.x, -; EG-NEXT: BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z, -; EG-NEXT: AND_INT * T2.W, T1.X, literal.y, -; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) -; EG-NEXT: ASHR T2.Y, T0.Y, PS, -; EG-NEXT: CNDE_INT T0.Z, PV.Z, PV.W, PV.Y, -; EG-NEXT: BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X, -; EG-NEXT: AND_INT * T2.W, T1.X, literal.x, +; EG-NEXT: AND_INT T2.X, T1.X, literal.x, +; EG-NEXT: AND_INT T2.Y, PS, literal.x, +; EG-NEXT: LSHL T3.Z, T0.Y, 1, +; EG-NEXT: LSHL T1.W, PV.W, PV.Z, +; EG-NEXT: LSHR * T2.W, T0.Z, PV.Y, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T3.X, PV.W, PS, +; EG-NEXT: ASHR T1.Y, T0.W, T1.Y, +; EG-NEXT: AND_INT T0.Z, T1.Z, literal.x, +; EG-NEXT: LSHL T1.W, PV.Z, PV.Y, +; EG-NEXT: LSHR * T2.W, T0.X, PV.X, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.X, PS, PV.W, PV.Y, +; EG-NEXT: OR_INT T2.Y, PV.W, PS, +; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y, +; EG-NEXT: ASHR T2.W, T0.Y, T2.X, +; EG-NEXT: AND_INT * T3.W, T1.X, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T1.X, PS, PV.Y, PV.W, ; EG-NEXT: ASHR T0.W, T0.W, literal.x, -; EG-NEXT: ASHR * T1.W, T0.Y, literal.x, +; EG-NEXT: ASHR * T4.W, T0.Y, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT * T0.W, T2.Z, T1.Y, PV.W, -; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; EG-NEXT: CNDE_INT * T0.Y, T2.W, T2.Y, T1.W, +; EG-NEXT: CNDE_INT * T1.W, T0.Z, T1.Y, PV.W, +; EG-NEXT: LSHR T0.X, KC0[2].Y, 
literal.x, +; EG-NEXT: CNDE_INT * T1.Y, T3.W, T2.W, T4.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1 %a = load <2 x i64>, ptr addrspace(1) %in @@ -651,57 +670,79 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 3 @6 -; EG-NEXT: ALU 39, @15, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; EG-NEXT: ALU 61, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 48, #1 -; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 0, #1 -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x, +; EG-NEXT: NOT_INT * T1.W, T1.Z, +; EG-NEXT: AND_INT T4.Z, T1.Z, literal.x, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: LSHL * T2.W, T0.W, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: ASHR T1.Y, T0.W, literal.x, -; EG-NEXT: ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212 -; EG-NEXT: AND_INT T1.W, T1.Z, literal.y, -; EG-NEXT: AND_INT * T2.W, T2.Z, literal.x, -; EG-NEXT: 31(4.344025e-44), 32(4.484155e-44) -; EG-NEXT: BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z, -; EG-NEXT: ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212 -; EG-NEXT: AND_INT * T1.Z, T2.Z, literal.x, -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z, -; EG-NEXT: AND_INT * T2.W, T2.X, literal.x, +; EG-NEXT: LSHL T1.W, PS, PV.W, +; EG-NEXT: LSHR * T2.W, T0.Z, PV.Z, +; EG-NEXT: ASHR T1.Y, T3.W, literal.x, +; EG-NEXT: OR_INT T0.Z, PV.W, PS, +; EG-NEXT: ASHR T1.W, 
T0.W, T4.Z, BS:VEC_102/SCL_221 +; EG-NEXT: NOT_INT * T2.W, T2.Z, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T5.X, T1.X, literal.x, -; EG-NEXT: ASHR T4.Y, T0.Y, PS, -; EG-NEXT: CNDE_INT T0.Z, T1.Z, PV.W, T2.Y, -; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X, -; EG-NEXT: AND_INT * T2.W, T2.X, literal.y, -; EG-NEXT: 31(4.344025e-44), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T0.X, PS, PV.W, PV.Y, -; EG-NEXT: ASHR T5.Y, T3.Y, PV.X, -; EG-NEXT: CNDE_INT T2.Z, T1.W, T4.X, T4.Z, -; EG-NEXT: BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221 -; EG-NEXT: AND_INT * T4.W, T1.X, literal.x, +; EG-NEXT: AND_INT T4.X, T1.Z, literal.x, +; EG-NEXT: NOT_INT T2.Y, T1.X, +; EG-NEXT: AND_INT T1.Z, T2.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T2.W, PS, literal.y, +; EG-NEXT: LSHL * T4.W, T3.W, 1, +; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) +; EG-NEXT: NOT_INT T5.X, T2.X, +; EG-NEXT: LSHL T4.Y, PS, PV.W, +; EG-NEXT: LSHR T3.Z, T3.Z, PV.Z, +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T4.W, T0.Y, 1, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T6.X, PS, PV.W, +; EG-NEXT: OR_INT T2.Y, PV.Y, PV.Z, +; EG-NEXT: AND_INT T3.Z, T2.X, literal.x, +; EG-NEXT: AND_INT T2.W, PV.X, literal.x, +; EG-NEXT: LSHL * T4.W, T3.Y, 1, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T5.X, T3.W, T1.Z, +; EG-NEXT: AND_INT T4.Y, T2.Z, literal.x, +; EG-NEXT: LSHL T1.Z, PS, PV.W, +; EG-NEXT: LSHR T2.W, T3.X, PV.Z, +; EG-NEXT: AND_INT * T3.W, T1.X, literal.y, +; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) +; EG-NEXT: LSHR T0.X, T0.X, PS, +; EG-NEXT: OR_INT T5.Y, PV.Z, PV.W, +; EG-NEXT: CNDE_INT T2.Z, PV.Y, T2.Y, PV.X, +; EG-NEXT: ASHR T4.W, T3.Y, T3.Z, +; EG-NEXT: AND_INT * T5.W, T2.X, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.X, PS, PV.Y, PV.W, +; EG-NEXT: OR_INT T2.Y, T6.X, PV.X, +; EG-NEXT: CNDE_INT T0.Z, T4.X, T0.Z, T1.W, BS:VEC_120/SCL_212 +; EG-NEXT: ASHR T3.W, T0.Y, T3.W, +; EG-NEXT: 
AND_INT * T6.W, T1.X, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Y, -; EG-NEXT: ASHR T6.Y, T3.W, literal.x, -; EG-NEXT: ASHR T3.Z, T0.Y, literal.x, BS:VEC_201 -; EG-NEXT: ADD_INT T3.W, KC0[2].Y, literal.y, -; EG-NEXT: CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y, +; EG-NEXT: CNDE_INT T0.X, PS, PV.Y, PV.W, +; EG-NEXT: ASHR T5.Y, T0.W, literal.x, +; EG-NEXT: ASHR T1.Z, T3.Y, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y, +; EG-NEXT: CNDE_INT * T2.W, T4.Y, T5.X, T1.Y, ; EG-NEXT: 31(4.344025e-44), 16(2.242078e-44) ; EG-NEXT: LSHR T1.X, PV.W, literal.x, -; EG-NEXT: CNDE_INT T0.Y, T2.W, T4.Y, PV.Z, -; EG-NEXT: ASHR T3.W, T3.Y, literal.y, -; EG-NEXT: CNDE_INT * T2.W, T1.W, T4.Z, PV.Y, +; EG-NEXT: CNDE_INT T2.Y, T5.W, T4.W, PV.Z, BS:VEC_021/SCL_122 +; EG-NEXT: ASHR T4.W, T0.Y, literal.y, +; EG-NEXT: CNDE_INT * T0.W, T4.X, T1.W, PV.Y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) ; EG-NEXT: LSHR T3.X, KC0[2].Y, literal.x, -; EG-NEXT: CNDE_INT * T2.Y, T4.W, T5.Y, PV.W, +; EG-NEXT: CNDE_INT * T0.Y, T6.W, T3.W, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1 %a = load <4 x i64>, ptr addrspace(1) %in diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -228,7 +228,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -237,16 +237,21 @@ ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: AND_INT * T0.W, T0.Z, literal.x, +; EG-NEXT: NOT_INT * T0.W, T0.Z, +; EG-NEXT: AND_INT T1.Z, T0.Z, literal.x, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: LSHL * T1.W, T0.Y, 1, ; 
EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T1.Z, T0.Y, PV.W, -; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z, +; EG-NEXT: LSHL T0.W, PS, PV.W, +; EG-NEXT: LSHR * T1.W, T0.X, PV.Z, +; EG-NEXT: OR_INT T2.Z, PV.W, PS, +; EG-NEXT: LSHR T0.W, T0.Y, T1.Z, ; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.X, PS, PV.W, PV.Z, +; EG-NEXT: CNDE_INT T0.X, PS, PV.Z, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: CNDE_INT * T0.Y, T1.W, T1.Z, 0.0, +; EG-NEXT: CNDE_INT * T0.Y, T1.W, T0.W, 0.0, %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1 %a = load i64, ptr addrspace(1) %in %b = load i64, ptr addrspace(1) %b_ptr @@ -311,9 +316,9 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 3 @6 -; EG-NEXT: ALU 34, @15, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 +; EG-NEXT: ALU 55, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1 @@ -323,41 +328,62 @@ ; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x, +; EG-NEXT: NOT_INT * T1.W, T1.Z, +; EG-NEXT: AND_INT T4.Z, T1.Z, literal.x, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: LSHL * T3.W, T0.W, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T4.Z, T0.W, PV.W, -; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, -; EG-NEXT: AND_INT * T3.W, T3.Z, literal.y, +; EG-NEXT: LSHL T1.W, PS, PV.W, +; EG-NEXT: LSHR * T3.W, T0.Z, PV.Z, +; EG-NEXT: OR_INT T0.Z, PV.W, PS, +; EG-NEXT: LSHR T0.W, T0.W, T4.Z, +; EG-NEXT: NOT_INT * T1.W, T3.Z, +; EG-NEXT: AND_INT T4.X, T1.Z, literal.x, +; 
EG-NEXT: NOT_INT T1.Y, T1.X, +; EG-NEXT: AND_INT T1.Z, T3.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T1.W, PS, literal.y, +; EG-NEXT: LSHL * T3.W, T2.W, 1, ; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) -; EG-NEXT: BIT_ALIGN_INT T4.X, T0.W, T0.Z, T1.Z, -; EG-NEXT: LSHR T1.Y, T2.W, PS, BS:VEC_120/SCL_212 -; EG-NEXT: AND_INT * T0.Z, T3.Z, literal.x, -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, T2.Z, T3.Z, -; EG-NEXT: AND_INT * T2.W, T3.X, literal.x, +; EG-NEXT: NOT_INT T5.X, T3.X, +; EG-NEXT: LSHL T3.Y, PS, PV.W, +; EG-NEXT: LSHR T2.Z, T2.Z, PV.Z, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T3.W, T0.Y, 1, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T6.X, PS, PV.W, +; EG-NEXT: OR_INT T1.Y, PV.Y, PV.Z, +; EG-NEXT: AND_INT T2.Z, T3.X, literal.x, +; EG-NEXT: AND_INT T1.W, PV.X, literal.x, +; EG-NEXT: LSHL * T3.W, T2.Y, 1, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T5.X, T1.X, literal.x, -; EG-NEXT: LSHR T3.Y, T2.Y, PS, -; EG-NEXT: CNDE_INT T2.Z, T0.Z, PV.W, T1.Y, -; EG-NEXT: BIT_ALIGN_INT T0.W, T2.Y, T2.X, T3.X, -; EG-NEXT: AND_INT * T3.W, T3.X, literal.y, -; EG-NEXT: 31(4.344025e-44), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Y, -; EG-NEXT: LSHR T4.Y, T0.Y, PV.X, -; EG-NEXT: CNDE_INT T1.Z, T1.W, T4.X, T4.Z, -; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, T1.X, BS:VEC_102/SCL_221 -; EG-NEXT: AND_INT * T4.W, T1.X, literal.x, +; EG-NEXT: LSHR T5.X, T2.W, T1.Z, +; EG-NEXT: AND_INT T3.Y, T3.Z, literal.x, +; EG-NEXT: LSHL T1.Z, PS, PV.W, +; EG-NEXT: LSHR T1.W, T2.X, PV.Z, +; EG-NEXT: AND_INT * T2.W, T1.X, literal.y, +; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) +; EG-NEXT: LSHR T0.X, T0.X, PS, +; EG-NEXT: OR_INT T4.Y, PV.Z, PV.W, +; EG-NEXT: CNDE_INT T3.Z, PV.Y, T1.Y, PV.X, +; EG-NEXT: LSHR T1.W, T2.Y, T2.Z, +; EG-NEXT: AND_INT * T4.W, T3.X, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.X, PS, PV.Y, PV.W, +; EG-NEXT: OR_INT 
T1.Y, T6.X, PV.X, +; EG-NEXT: CNDE_INT T0.Z, T4.X, T0.Z, T0.W, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T2.W, T0.Y, T2.W, +; EG-NEXT: AND_INT * T5.W, T1.X, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T1.X, PS, PV.W, PV.Y, -; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: CNDE_INT * T2.W, T0.Z, T1.Y, 0.0, +; EG-NEXT: CNDE_INT T0.X, PS, PV.Y, PV.W, +; EG-NEXT: ADD_INT T6.W, KC0[2].Y, literal.x, +; EG-NEXT: CNDE_INT * T3.W, T3.Y, T5.X, 0.0, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T0.X, PV.W, literal.x, -; EG-NEXT: CNDE_INT T2.Y, T3.W, T3.Y, 0.0, -; EG-NEXT: CNDE_INT T1.W, T1.W, T4.Z, 0.0, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHR T1.X, PV.W, literal.x, +; EG-NEXT: CNDE_INT T3.Y, T4.W, T1.W, 0.0, +; EG-NEXT: CNDE_INT T0.W, T4.X, T0.W, 0.0, BS:VEC_021/SCL_122 +; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: CNDE_INT * T1.Y, T4.W, T4.Y, 0.0, +; EG-NEXT: CNDE_INT * T0.Y, T5.W, T2.W, 0.0, %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1 %a = load <4 x i64>, ptr addrspace(1) %in %b = load <4 x i64>, ptr addrspace(1) %b_ptr