diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -137,6 +137,10 @@ /// them) when they are deleted from the underlying DAG. It relies on /// stable indices of nodes within the worklist. DenseMap WorklistMap; + /// This records all nodes we attempted to add to the worklist since we + /// considered a new worklist entry. As we do not add duplicate nodes + /// to the worklist, this is different from the tail of the worklist. + SmallSetVector PruningList; /// Set of nodes which have been combined (at least once). /// @@ -154,6 +158,37 @@ AddToWorklist(Node); } + // Prune potentially dangling nodes. This is called after + // any visit to a node, but should also be called during a visit after any + // failed combine which may have created a DAG node. + void clearAddedDanglingWorklistEntries() { + // Check any nodes added to the worklist to see if they are prunable. + while (!PruningList.empty()) { + auto *N = PruningList.pop_back_val(); + if (N->use_empty()) + recursivelyDeleteUnusedNodes(N); + } + } + + SDNode *getNextWorklistEntry() { + // Before we do any work, remove nodes that are not in use. + clearAddedDanglingWorklistEntries(); + SDNode *N = nullptr; + // The Worklist holds the SDNodes in order, but it may contain null + // entries. + while (!N && !Worklist.empty()) { + N = Worklist.pop_back_val(); + } + + if (N) { + bool GoodWorklistEntry = WorklistMap.erase(N); + (void)GoodWorklistEntry; + assert(GoodWorklistEntry && + "Found a worklist entry without a corresponding map entry!"); + } + return N; + } + /// Call the node-specific routine that folds each particular type of node. SDValue visit(SDNode *N); @@ -182,6 +217,9 @@ if (N->getOpcode() == ISD::HANDLENODE) return; + // Mark this for potential pruning. 
+ PruningList.insert(N); + if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second) Worklist.push_back(N); } @@ -189,6 +227,7 @@ /// Remove all instances of N from the worklist. void removeFromWorklist(SDNode *N) { CombinedNodes.erase(N); + PruningList.remove(N); auto It = WorklistMap.find(N); if (It == WorklistMap.end()) @@ -1403,19 +1442,8 @@ // changes of the root. HandleSDNode Dummy(DAG.getRoot()); - // While the worklist isn't empty, find a node and try to combine it. - while (!WorklistMap.empty()) { - SDNode *N; - // The Worklist holds the SDNodes in order, but it may contain null entries. - do { - N = Worklist.pop_back_val(); - } while (!N); - - bool GoodWorklistEntry = WorklistMap.erase(N); - (void)GoodWorklistEntry; - assert(GoodWorklistEntry && - "Found a worklist entry without a corresponding map entry!"); - + // While we have a valid worklist entry node, try to combine it. + while (SDNode *N = getNextWorklistEntry()) { // If N has no uses, it is dead. Make sure to revisit all N's operands once // N is deleted from the DAG, since they too may now be dead or may have a // reduced number of uses, allowing other xforms. 
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -405,7 +405,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000 ; SI-NOT: and @@ -420,7 +420,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000 ; SI-NOT: and @@ -435,7 +435,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000 ; SI-NOT: and @@ -450,7 +450,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000 ; SI-NOT: and @@ -463,7 +463,7 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64: ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0 ; SI-NOT: and @@ -476,7 +476,7 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64: ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0 ; SI-NOT: and @@ -491,7 +491,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000 ; SI-NOT: and @@ -506,7 +506,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000 ; SI-NOT: and @@ -549,7 +549,7 @@ ; Shift into 
upper 32-bits ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and @@ -562,7 +562,7 @@ ; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64: ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 ; SI-NOT: and diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -181,11 +181,10 @@ } ; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16: -; VI: s_lshr_b32 s1, s0, 16 -; VI: s_add_i32 s1, s1, 1 +; VI: s_and_b32 s1, s0, 0xffff0000 ; VI: s_add_i32 s0, s0, 1 +; VI: s_add_i32 s1, s1, 0x10000 ; VI: s_and_b32 s0, s0, 0xffff -; VI: s_lshl_b32 s1, s1, 16 ; VI: s_or_b32 s0, s0, s1 ; VI: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1366,7 +1366,7 @@ ; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]] ; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]] -; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]] +; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]] ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]] diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2178,23 +2178,23 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort 
v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 +; GFX8-NEXT: s_and_b32 s3, s1, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: s_and_b32 s2, s0, s2 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2202,23 +2202,23 @@ ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -201,38 +201,29 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s1, 8 -; GFX8-NEXT: s_lshr_b32 s6, s2, 8 -; GFX8-NEXT: s_sext_i32_i8 s4, s2 -; GFX8-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX8-NEXT: s_bfe_i32 s6, s6, 0x80000 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_sext_i32_i8 s3, s1 -; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: s_and_b32 s4, s0, s5 -; GFX8-NEXT: s_and_b32 s5, s0, s6 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x80000 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX8-NEXT: s_sext_i32_i8 s0, s2 +; GFX8-NEXT: s_sext_i32_i8 s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_i32 s4, s3, 0x80008 +; GFX8-NEXT: s_bfe_i32 s5, s3, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80008 +; GFX8-NEXT: s_bfe_i32 s4, s2, 0x80010 +; GFX8-NEXT: s_ashr_i32 s3, s3, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_and_b32 s1, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: s_and_b32 
s0, s0, s2 +; GFX8-NEXT: s_ashr_i32 s2, s2, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -241,38 +232,29 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 8 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s2, 8 -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s6, 0x80000 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: s_and_b32 s4, s0, s5 -; GFX9-NODL-NEXT: s_and_b32 s5, s0, s6 -; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000 -; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s4, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s3, 0x80010 +; GFX9-NODL-NEXT: 
v_mov_b32_e32 v4, s4 +; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s4, s2, 0x80010 +; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_and_b32 s0, s0, s2 +; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v4, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v5, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -281,38 +263,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 8 -; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 8 -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX9-DL-NEXT: s_bfe_i32 s6, s6, 0x80000 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1 -; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-DL-NEXT: s_and_b32 s4, s0, s5 -; GFX9-DL-NEXT: s_and_b32 s5, s0, s6 
-; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000 -; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DL-NEXT: s_and_b32 s1, s0, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-DL-NEXT: s_and_b32 s0, s0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v4, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -399,20 +358,20 @@ ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX8-NEXT: s_and_b32 s3, s1, s2 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_and_b32 s2, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 @@ -431,20 +390,20 @@ ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 ; 
GFX9-NODL-NEXT: s_and_b32 s2, s0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -355,20 +355,20 @@ ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX8-NEXT: s_and_b32 s3, s1, s2 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_and_b32 s2, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 @@ -387,20 +387,20 @@ ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: 
s_bfe_u32 s5, s0, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 @@ -485,23 +485,23 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s0, s2 +; GFX8-NEXT: s_and_b32 s3, s2, s0 +; GFX8-NEXT: s_and_b32 s0, s1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -509,23 
+509,23 @@ ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -533,23 +533,23 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s1, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s0, s2 +; 
GFX9-DL-NEXT: s_and_b32 s3, s2, s0 +; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -619,19 +619,19 @@ ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_and_b32 s0, s2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 @@ -651,19 +651,19 @@ ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 ; 
GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 @@ -765,19 +765,19 @@ ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_and_b32 s3, s2, s0 -; GFX8-NEXT: s_and_b32 s0, s1, s0 +; GFX8-NEXT: s_and_b32 s3, s1, s0 ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 @@ -797,19 +797,19 @@ ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NODL-NEXT: 
s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 @@ -829,19 +829,19 @@ ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-DL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v4, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 @@ -1268,33 +1268,30 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80000 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008 -; GFX8-NEXT: s_bfe_i32 s4, s1, 0x80000 -; GFX8-NEXT: s_and_b32 s3, s2, s3 -; GFX8-NEXT: s_and_b32 s2, s2, s4 -; 
GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: s_bfe_u32 s8, s1, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v5, s2 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: s_lshr_b32 s0, s0, 24 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v4, v2 +; GFX8-NEXT: s_bfe_u32 s0, s2, 0x80008 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_sext_i32_i8 s4, s3 +; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_sext_i32_i8 s1, s2 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX8-NEXT: s_lshr_b32 s3, s3, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1302,33 +1299,30 @@ ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80000 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s4, s1, 0x80000 -; GFX9-NODL-NEXT: s_and_b32 s3, s2, s3 -; GFX9-NODL-NEXT: s_and_b32 s2, s2, s4 -; 
GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s1, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v4, v2 +; GFX9-NODL-NEXT: s_bfe_u32 s0, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s1, s3, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s3 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s3, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1336,33 +1330,30 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s3, s0, 0x80000 -; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80008 -; 
GFX9-DL-NEXT: s_bfe_i32 s4, s1, 0x80000 -; GFX9-DL-NEXT: s_and_b32 s3, s2, s3 -; GFX9-DL-NEXT: s_and_b32 s2, s2, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v4, v2 +; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s1, s3, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s3 +; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-DL-NEXT: s_sext_i32_i8 s1, s2 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -308,52 +308,43 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 4 -; GFX8-NEXT: s_lshr_b32 s1, s4, 4 -; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40000 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: 
s_bfe_i32 s6, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40004 +; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40008 ; GFX8-NEXT: s_lshr_b32 s1, s2, 12 -; GFX8-NEXT: s_lshr_b32 s5, s4, 12 -; GFX8-NEXT: v_mov_b32_e32 v6, s0 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40008 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5 -; GFX8-NEXT: v_mul_i32_i24_e32 v6, s7, v6 -; GFX8-NEXT: s_lshr_b32 s0, s2, 20 -; GFX8-NEXT: s_lshr_b32 s1, s4, 20 -; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40010 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v13, s5 -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s9, s4, 28 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x40018 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s9 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x40018 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX8-NEXT: s_lshr_b32 s7, s4, 12 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4 +; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX8-NEXT: s_bfe_i32 s12, s4, 0x40014 +; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s2, 
0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s12 +; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40018 +; GFX8-NEXT: s_ashr_i32 s4, s4, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s14 +; GFX8-NEXT: s_ashr_i32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v13, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -368,52 +359,43 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 4 -; GFX9-NEXT: s_lshr_b32 s1, s4, 4 -; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40000 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40008 ; GFX9-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-NEXT: s_lshr_b32 s5, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40008 -; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; 
GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s5 -; GFX9-NEXT: v_mul_i32_i24_e32 v6, s7, v6 -; GFX9-NEXT: s_lshr_b32 s0, s2, 20 -; GFX9-NEXT: s_lshr_b32 s1, s4, 20 -; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40010 -; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s0 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s1 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v13, s5 -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_lshr_b32 s9, s4, 28 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x40018 -; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s0 -; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s9 -; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40018 -; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-NEXT: s_lshr_b32 s7, s4, 12 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s1 +; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4 +; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40014 +; GFX9-NEXT: s_bfe_i32 s11, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40018 +; GFX9-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: s_ashr_i32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_mad_u32_u24 v2, v7, v8, 
v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v13, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, v9, v10, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, v11, v12, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -428,52 +410,43 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 4 -; GFX9-DL-NEXT: s_lshr_b32 s1, s4, 4 -; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40000 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-DL-NEXT: s_bfe_i32 s0, s4, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40008 ; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-DL-NEXT: s_lshr_b32 s5, s4, 12 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40008 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, s7, v6 -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 20 -; GFX9-DL-NEXT: s_lshr_b32 s1, s4, 20 -; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40010 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s0 -; 
GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s1 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v13, s5 -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-DL-NEXT: s_lshr_b32 s9, s4, 28 -; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x40018 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s0 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s9 -; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x40018 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4 +; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v7, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v13, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v9, v10, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v11, v12, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -622,60 +595,45 @@ ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s7, s0, 4 -; GFX8-NEXT: s_lshr_b32 s11, s1, 4 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s7 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s11 -; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40000 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX8-NEXT: s_lshr_b32 s4, s0, 12 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX8-NEXT: s_lshr_b32 s5, s1, 12 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40008 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: s_lshr_b32 s6, s0, 12 -; GFX8-NEXT: s_lshr_b32 s10, s1, 12 -; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40008 -; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v12, s13 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s10 -; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 -; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3 +; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX8-NEXT: 
v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: s_lshr_b32 s5, s0, 20 -; GFX8-NEXT: s_lshr_b32 s9, s1, 20 -; GFX8-NEXT: v_mul_i32_i24_e32 v5, s14, v5 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s9 -; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v6, s2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, s2, v7 -; GFX8-NEXT: s_lshr_b32 s8, s1, 28 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: s_lshr_b32 s4, s0, 28 -; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v13, s17 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s8 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x40018 -; GFX8-NEXT: v_and_b32_e32 v8, s2, v8 -; GFX8-NEXT: v_and_b32_e32 v9, s2, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x40018 -; GFX8-NEXT: v_and_b32_e32 v10, s2, v10 -; GFX8-NEXT: v_and_b32_e32 v11, s2, v11 +; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s13 +; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s15 +; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v12, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v13, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v8, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v10, v11, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -691,60 +649,45 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s7, s0, 4 -; GFX9-NEXT: s_lshr_b32 s11, s1, 4 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s11 -; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40000 -; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX9-NEXT: s_lshr_b32 s4, s0, 12 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX9-NEXT: s_lshr_b32 s5, s1, 12 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-NEXT: s_lshr_b32 s6, s0, 12 -; GFX9-NEXT: s_lshr_b32 s10, s1, 12 -; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40008 -; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v12, s13 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s10 -; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; 
GFX9-NEXT: s_lshr_b32 s5, s0, 20 -; GFX9-NEXT: s_lshr_b32 s9, s1, 20 -; GFX9-NEXT: v_mul_i32_i24_e32 v5, s14, v5 -; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s9 -; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v6, s2, v6 -; GFX9-NEXT: v_and_b32_e32 v7, s2, v7 -; GFX9-NEXT: s_lshr_b32 s8, s1, 28 -; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-NEXT: s_lshr_b32 s4, s0, 28 -; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v13, s17 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s8 -; GFX9-NEXT: s_bfe_i32 s1, s1, 0x40018 -; GFX9-NEXT: v_and_b32_e32 v8, s2, v8 -; GFX9-NEXT: v_and_b32_e32 v9, s2, v9 -; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0x40018 -; GFX9-NEXT: v_and_b32_e32 v10, s2, v10 -; GFX9-NEXT: v_and_b32_e32 v11, s2, v11 +; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s13 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v12, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v13, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, v8, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; 
GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, v10, v11, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -760,60 +703,45 @@ ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4 -; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11 -; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40000 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 12 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 12 -; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40008 -; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v12, s13 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s10 -; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: 
v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 20 -; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 20 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, s14, v5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s9 -; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v6, s2, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v7, s2, v7 -; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 28 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 -; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v13, s17 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s8 -; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x40018 -; GFX9-DL-NEXT: v_and_b32_e32 v8, s2, v8 -; GFX9-DL-NEXT: v_and_b32_e32 v9, s2, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: s_bfe_i32 s0, s0, 0x40018 -; GFX9-DL-NEXT: v_and_b32_e32 v10, s2, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v11, s2, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13 +; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v12, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v13, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v8, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, 
v7, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v10, v11, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -819,35 +819,35 @@ ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: v_mul_u32_u24_e32 v4, s7, v4 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x4000c -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v7, s7 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: s_bfe_u32 s13, s4, 
0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -870,35 +870,35 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: v_mul_u32_u24_e32 v4, s7, v4 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x4000c -; GFX9-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; 
GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 @@ -921,35 +921,35 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, s7, v4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x4000c -; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 -; 
GFX9-DL-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1074,32 +1074,35 @@ ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; 
GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s10 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1122,32 +1125,35 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-NEXT: 
s_bfe_u32 s6, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1170,32 
+1176,35 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-DL-NEXT: 
v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 @@ -2336,35 +2345,35 @@ ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: v_mul_u32_u24_e32 v4, s7, v4 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x4000c -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v7, s7 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, 
v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -2387,35 +2396,35 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: v_mul_u32_u24_e32 v4, s7, v4 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x4000c -; GFX9-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s4, s4, 
28 -; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 @@ -2438,35 +2447,35 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, s7, v4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x4000c -; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; 
GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll --- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll @@ -1,8 +1,8 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN -; 
RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN1 +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2 +; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2 declare i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -30,8 +30,12 @@ ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x ; EG: 16 ; FIXME: Should be using scalar instructions here. -; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 +; GCN1: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN1: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 +; GCN2: s_mul_i32 [[MUL:s[0-9]]], {{[s][0-9], [s][0-9]}} +; GCN2: s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}} +; GCN2: s_sext_i32_i16 s0, [[MAD]] +; GCN2: v_mov_b32_e32 v0, s0 define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { entry: %0 = mul i16 %a, %b @@ -47,8 +51,12 @@ ; The result must be sign-extended ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x ; EG: 8 -; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 +; GCN1: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN1: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 +; GCN2: s_mul_i32 [[MUL:s[0-9]]], {{[s][0-9], [s][0-9]}} +; GCN2: s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}} +; GCN2: s_sext_i32_i8 s0, [[MAD]] +; GCN2: v_mov_b32_e32 v0, s0 define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { 
entry: %0 = mul i8 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -307,10 +307,10 @@ } ; GCN-LABEL: {{^}}and_not_mask_i64: -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN-DAG: buffer_load_dword v[[VAL:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}} -; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]] +; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VAL]] ; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]] ; GCN-NOT: v[[SHRLO]] ; GCN-NOT: v[[SHRHI]] diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -54,9 +54,9 @@ ; after 64-bit shift is split. ; GCN-LABEL: {{^}}lshr_and_i64_35: -; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_load_dword v[[LO:[0-9]+]] +; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -384,9 +384,11 @@ ret void } +; FIXME: This or should fold into an offset on the write ; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds: ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0 -; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32 +; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]] +; GCN: ds_write_b32 [[SCALE1]], 
v{{[0-9]+}} ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0 ; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -27,7 +27,6 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -439,7 +438,6 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -477,7 +475,6 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: s_or_b32 s0, s0, 1 ; VI-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll --- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll +++ b/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll @@ -85,10 +85,11 @@ } ; CHECK-LABEL: unsafe_add_underflow: -; CHECK: subs r0, #2 -; CHECK: uxtb [[EXT:r[0-9]+]], r0 -; CHECK: cmp [[EXT]], #255 -; CHECK: moveq r0, #8 +; CHECK: movs r1, #16 +; CHECK: cmp r0, #1 +; CHECK: it eq +; CHECK: moveq r1, #8 +; CHECK: mov r0, r1 define i32 @unsafe_add_underflow(i8 zeroext %a) { %add = add i8 %a, -2 %cmp = icmp ugt i8 %add, 254 diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll --- a/llvm/test/CodeGen/ARM/vdup.ll +++ b/llvm/test/CodeGen/ARM/vdup.ll @@ -455,7 +455,6 @@ define <4 x i16> @check_i16(<8 x i16> %v) nounwind { ; CHECK-LABEL: check_i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vmov d16, r0, r1 ; 
CHECK-NEXT: vdup.16 d16, d16[3] ; CHECK-NEXT: vmov r0, r1, d16 @@ -469,7 +468,6 @@ define <8 x i8> @check_i8(<16 x i8> %v) nounwind { ; CHECK-LABEL: check_i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vmov d16, r0, r1 ; CHECK-NEXT: vdup.8 d16, d16[3] ; CHECK-NEXT: vmov r0, r1, d16 diff --git a/llvm/test/CodeGen/PowerPC/pr39478.ll b/llvm/test/CodeGen/PowerPC/pr39478.ll --- a/llvm/test/CodeGen/PowerPC/pr39478.ll +++ b/llvm/test/CodeGen/PowerPC/pr39478.ll @@ -7,8 +7,6 @@ define void @pr39478() { ; CHECK-LABEL: pr39478: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lwz 3, 0(3) -; CHECK-NEXT: stb 3, 0(3) ; CHECK-NEXT: blr entry: %tmp32 = load i64, i64* undef, align 8 diff --git a/llvm/test/CodeGen/PowerPC/testComparesigesll.ll b/llvm/test/CodeGen/PowerPC/testComparesigesll.ll --- a/llvm/test/CodeGen/PowerPC/testComparesigesll.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesigesll.ll @@ -99,14 +99,14 @@ ; CHECK-NEXT: blr ; CHECK-BE-LABEL: test_igesll_sext_z: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: sradi r3, r3, 63 ; CHECK-BE-NEXT: not r3, r3 +; CHECK-BE-NEXT: sradi r3, r3, 63 ; CHECK-BE-NEXT: blr ; ; CHECK-LE-LABEL: test_igesll_sext_z: ; CHECK-LE: # %bb.0: # %entry -; CHECK-LE-NEXT: sradi r3, r3, 63 ; CHECK-LE-NEXT: not r3, r3 +; CHECK-LE-NEXT: sradi r3, r3, 63 ; CHECK-LE-NEXT: blr entry: %cmp = icmp sgt i64 %a, -1 diff --git a/llvm/test/CodeGen/X86/constant-combines.ll b/llvm/test/CodeGen/X86/constant-combines.ll --- a/llvm/test/CodeGen/X86/constant-combines.ll +++ b/llvm/test/CodeGen/X86/constant-combines.ll @@ -15,12 +15,7 @@ ; ; CHECK-LABEL: PR22524: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: mulss %xmm0, %xmm1 ; CHECK-NEXT: movq $0, (%rdi) -; CHECK-NEXT: movss %xmm1, 4(%rdi) ; CHECK-NEXT: retq entry: %0 = getelementptr inbounds { float, float }, { float, float }* %arg, i32 0, i32 1 diff --git a/llvm/test/CodeGen/X86/jump_sign.ll 
b/llvm/test/CodeGen/X86/jump_sign.ll --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -238,8 +238,8 @@ ; CHECK-NEXT: movzwl (%eax), %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD -; CHECK-NEXT: shrl $19, %ecx -; CHECK-NEXT: addl %ecx, %ecx +; CHECK-NEXT: shrl $18, %ecx +; CHECK-NEXT: andl $-2, %ecx ; CHECK-NEXT: leal (%ecx,%ecx,4), %ecx ; CHECK-NEXT: cmpw %cx, %ax ; CHECK-NEXT: jne .LBB12_5 diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -249,7 +249,7 @@ ; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; X64-NEXT: movslq %edi, %rax -; X64-NEXT: vpinsrq $0, %rax, %xmm0, %xmm1 +; X64-NEXT: vmovq %rax, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/legalize-shift-64.ll b/llvm/test/CodeGen/X86/legalize-shift-64.ll --- a/llvm/test/CodeGen/X86/legalize-shift-64.ll +++ b/llvm/test/CodeGen/X86/legalize-shift-64.ll @@ -143,9 +143,9 @@ ; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: orl $0, %eax -; CHECK-NEXT: je .LBB5_3 +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: jne .LBB5_3 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: jmp .LBB5_2 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -2645,9 +2645,9 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; 
SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 ; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0 ; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1 -; SKX_32-NEXT: kxnorw %k0, %k0, %k1 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; SKX_32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/movmsk.ll b/llvm/test/CodeGen/X86/movmsk.ll --- a/llvm/test/CodeGen/X86/movmsk.ll +++ b/llvm/test/CodeGen/X86/movmsk.ll @@ -95,14 +95,11 @@ } ; PR11570 -; FIXME: This should also use movmskps; we don't form the FGETSIGN node -; in this case, though. define void @float_call_signbit(double %n) { ; CHECK-LABEL: float_call_signbit: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: movq %xmm0, %rdi -; CHECK-NEXT: shrq $63, %rdi -; CHECK-NEXT: ## kill: def $edi killed $edi killed $rdi +; CHECK-NEXT: movmskpd %xmm0, %edi +; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL entry: %t0 = bitcast double %n to i64 diff --git a/llvm/test/CodeGen/X86/not-and-simplify.ll b/llvm/test/CodeGen/X86/not-and-simplify.ll --- a/llvm/test/CodeGen/X86/not-and-simplify.ll +++ b/llvm/test/CodeGen/X86/not-and-simplify.ll @@ -20,8 +20,9 @@ define <4 x i32> @shrink_xor_constant1_splat(<4 x i32> %x) { ; ALL-LABEL: shrink_xor_constant1_splat: ; ALL: # %bb.0: +; ALL-NEXT: pcmpeqd %xmm1, %xmm1 +; ALL-NEXT: pxor %xmm1, %xmm0 ; ALL-NEXT: psrld $31, %xmm0 -; ALL-NEXT: pxor {{.*}}(%rip), %xmm0 ; ALL-NEXT: retq %sh = lshr <4 x i32> %x, %not = xor <4 x i32> %sh, diff --git a/llvm/test/CodeGen/X86/pr28504.ll b/llvm/test/CodeGen/X86/pr28504.ll --- a/llvm/test/CodeGen/X86/pr28504.ll +++ b/llvm/test/CodeGen/X86/pr28504.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s ; The test case is rather involved, because we need to get to a state where @@ -7,13 +8,17 @@ ; Basically, what we want to see is that the compare result zero-extended, and ; then stored. 
Only one zext, and no sexts. -; CHECK-LABEL: main: -; CHECK: movzbl (%rdi), %[[EAX:.*]] -; CHECK-NEXT: xorl %e[[C:.]]x, %e[[C]]x -; CHECK-NEXT: cmpl $1, %[[EAX]] -; CHECK-NEXT: sete %[[C]]l -; CHECK-NEXT: movl %e[[C]]x, (%rsi) define void @main(i8* %p, i32* %q) { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpb $1, (%rdi) +; CHECK-NEXT: sete %al +; CHECK-NEXT: movl %eax, (%rsi) +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.2: # %bb22 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_1: # %bb21 bb: %tmp4 = load i8, i8* %p, align 1 %tmp5 = sext i8 %tmp4 to i32 diff --git a/llvm/test/CodeGen/X86/pr33844.ll b/llvm/test/CodeGen/X86/pr33844.ll --- a/llvm/test/CodeGen/X86/pr33844.ll +++ b/llvm/test/CodeGen/X86/pr33844.ll @@ -10,15 +10,7 @@ define void @patatino() { ; CHECK-LABEL: patatino: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movl {{.*}}(%rip), %eax -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrl $31, %ecx -; CHECK-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-NEXT: shrl $31, %ecx -; CHECK-NEXT: andl $-2, %ecx -; CHECK-NEXT: andl $-536870912, %eax # imm = 0xE0000000 -; CHECK-NEXT: orl %ecx, %eax -; CHECK-NEXT: movl %eax, {{.*}}(%rip) +; CHECK-NEXT: andl $-536870912, {{.*}}(%rip) # imm = 0xE0000000 ; CHECK-NEXT: retq bb: %tmp = load i32, i32* @global diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll --- a/llvm/test/CodeGen/X86/sse3.ll +++ b/llvm/test/CodeGen/X86/sse3.ll @@ -396,14 +396,14 @@ define <4 x i32> @t17() nounwind { ; X86-LABEL: t17: ; X86: # %bb.0: # %entry -; X86-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: andpd {{\.LCPI.*}}, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] +; X86-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: t17: ; X64: # %bb.0: # %entry -; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] -; X64-NEXT: andpd {{.*}}(%rip), %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] +; X64-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-NEXT: retq entry: 
%tmp1 = load <4 x float>, <4 x float>* undef, align 16 diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll --- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll @@ -5,20 +5,10 @@ define i32 @test0(<1 x i64>* %v4) nounwind { ; X32-LABEL: test0: ; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %eax -; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X32-NEXT: movl %ecx, (%esp) -; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3] +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pshufw $238, (%eax), %mm0 # mm0 = mem[2,3,2,3] ; X32-NEXT: movd %mm0, %eax ; X32-NEXT: addl $32, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp ; X32-NEXT: retl ; ; X64-LABEL: test0: