Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1308,98 +1308,106 @@
   LegalOperations = Level >= AfterLegalizeVectorOps;
   LegalTypes = Level >= AfterLegalizeTypes;

-  // Add all the dag nodes to the worklist.
-  for (SDNode &Node : DAG.allnodes())
-    AddToWorklist(&Node);
-
   // Create a dummy node (which is not added to allnodes), that adds a reference
   // to the root node, preventing it from being deleted, and tracking any
   // changes of the root.
   HandleSDNode Dummy(DAG.getRoot());

-  // While the worklist isn't empty, find a node and try to combine it.
-  while (!WorklistMap.empty()) {
-    SDNode *N;
-    // The Worklist holds the SDNodes in order, but it may contain null entries.
-    do {
-      N = Worklist.pop_back_val();
-    } while (!N);
-
-    bool GoodWorklistEntry = WorklistMap.erase(N);
-    (void)GoodWorklistEntry;
-    assert(GoodWorklistEntry &&
-           "Found a worklist entry without a corresponding map entry!");
-
-    // If N has no uses, it is dead. Make sure to revisit all N's operands once
-    // N is deleted from the DAG, since they too may now be dead or may have a
-    // reduced number of uses, allowing other xforms.
-    if (recursivelyDeleteUnusedNodes(N))
-      continue;
+  for (unsigned Iteration = 0; Iteration < 3; Iteration++) {
+    // Add all the dag nodes to the worklist.
+    for (SDNode &Node : DAG.allnodes())
+      AddToWorklist(&Node);

-    WorklistRemover DeadNodes(*this);
+    bool Changed = false;
+
+    // While the worklist isn't empty, find a node and try to combine it.
+    while (!WorklistMap.empty()) {
+      SDNode *N;
+      // The Worklist holds the SDNodes in order, but it may contain null entries.
+      do {
+        N = Worklist.pop_back_val();
+      } while (!N);
+
+      bool GoodWorklistEntry = WorklistMap.erase(N);
+      (void)GoodWorklistEntry;
+      assert(GoodWorklistEntry &&
+             "Found a worklist entry without a corresponding map entry!");
+
+      // If N has no uses, it is dead. Make sure to revisit all N's operands once
+      // N is deleted from the DAG, since they too may now be dead or may have a
+      // reduced number of uses, allowing other xforms.
+      if (recursivelyDeleteUnusedNodes(N))
+        continue;
+
+      WorklistRemover DeadNodes(*this);

-    // If this combine is running after legalizing the DAG, re-legalize any
-    // nodes pulled off the worklist.
-    if (Level == AfterLegalizeDAG) {
-      SmallSetVector<SDNode *, 16> UpdatedNodes;
-      bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
+      // If this combine is running after legalizing the DAG, re-legalize any
+      // nodes pulled off the worklist.
+      if (Level == AfterLegalizeDAG) {
+        SmallSetVector<SDNode *, 16> UpdatedNodes;
+        bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);

-      for (SDNode *LN : UpdatedNodes) {
-        AddToWorklist(LN);
-        AddUsersToWorklist(LN);
+        for (SDNode *LN : UpdatedNodes) {
+          AddToWorklist(LN);
+          AddUsersToWorklist(LN);
+        }
+        if (!NIsValid)
+          continue;
       }
-      if (!NIsValid)
-        continue;
-    }

-    DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
+      DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));

-    // Add any operands of the new node which have not yet been combined to the
-    // worklist as well. Because the worklist uniques things already, this
-    // won't repeatedly process the same operand.
-    CombinedNodes.insert(N);
-    for (const SDValue &ChildN : N->op_values())
-      if (!CombinedNodes.count(ChildN.getNode()))
-        AddToWorklist(ChildN.getNode());
+      // Add any operands of the new node which have not yet been combined to
+      // the worklist as well. Because the worklist uniques things already,
+      // this won't repeatedly process the same operand.
+      CombinedNodes.insert(N);
+      for (const SDValue &ChildN : N->op_values())
+        if (!CombinedNodes.count(ChildN.getNode()))
+          AddToWorklist(ChildN.getNode());

-    SDValue RV = combine(N);
+      SDValue RV = combine(N);

-    if (!RV.getNode())
-      continue;
+      if (!RV.getNode())
+        continue;

-    ++NodesCombined;
+      ++NodesCombined;
+      Changed = true;

-    // If we get back the same node we passed in, rather than a new node or
-    // zero, we know that the node must have defined multiple values and
-    // CombineTo was used. Since CombineTo takes care of the worklist
-    // mechanics for us, we have no work to do in this case.
-    if (RV.getNode() == N)
-      continue;
+      // If we get back the same node we passed in, rather than a new node or
+      // zero, we know that the node must have defined multiple values and
+      // CombineTo was used. Since CombineTo takes care of the worklist
+      // mechanics for us, we have no work to do in this case.
+      if (RV.getNode() == N)
+        continue;

-    assert(N->getOpcode() != ISD::DELETED_NODE &&
-           RV.getOpcode() != ISD::DELETED_NODE &&
-           "Node was deleted but visit returned new node!");
+      assert(N->getOpcode() != ISD::DELETED_NODE &&
+             RV.getOpcode() != ISD::DELETED_NODE &&
+             "Node was deleted but visit returned new node!");

-    DEBUG(dbgs() << " ... into: ";
-          RV.getNode()->dump(&DAG));
+      DEBUG(dbgs() << " ... into: ";
+            RV.getNode()->dump(&DAG));

-    if (N->getNumValues() == RV.getNode()->getNumValues())
-      DAG.ReplaceAllUsesWith(N, RV.getNode());
-    else {
-      assert(N->getValueType(0) == RV.getValueType() &&
-             N->getNumValues() == 1 && "Type mismatch");
-      DAG.ReplaceAllUsesWith(N, &RV);
-    }
+      if (N->getNumValues() == RV.getNode()->getNumValues())
+        DAG.ReplaceAllUsesWith(N, RV.getNode());
+      else {
+        assert(N->getValueType(0) == RV.getValueType() &&
+               N->getNumValues() == 1 && "Type mismatch");
+        DAG.ReplaceAllUsesWith(N, &RV);
+      }

-    // Push the new node and any users onto the worklist
-    AddToWorklist(RV.getNode());
-    AddUsersToWorklist(RV.getNode());
+      // Push the new node and any users onto the worklist
+      AddToWorklist(RV.getNode());
+      AddUsersToWorklist(RV.getNode());

-    // Finally, if the node is now dead, remove it from the graph. The node
-    // may not be dead if the replacement process recursively simplified to
-    // something else needing this node. This will also take care of adding any
-    // operands which have lost a user to the worklist.
-    recursivelyDeleteUnusedNodes(N);
+      // Finally, if the node is now dead, remove it from the graph. The node
+      // may not be dead if the replacement process recursively simplified to
+      // something else needing this node. This will also take care of adding any
+      // operands which have lost a user to the worklist.
+      recursivelyDeleteUnusedNodes(N);
+    }
+
+    if (!Changed)
+      break;
   }

   // If the root changed (e.g. it was a dead load), update the root.
@@ -5743,7 +5751,6 @@
   if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);

-  // If the sign bit is known to be zero, switch this to a SRL.
if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1); Index: test/CodeGen/AArch64/arm64-narrow-st-merge.ll =================================================================== --- test/CodeGen/AArch64/arm64-narrow-st-merge.ll +++ test/CodeGen/AArch64/arm64-narrow-st-merge.ll @@ -19,7 +19,7 @@ } ; CHECK-LABEL: Strh_zero_4 -; CHECK: stp wzr, wzr +; CHECK: str xzr ; CHECK-STRICT-LABEL: Strh_zero_4 ; CHECK-STRICT: strh wzr ; CHECK-STRICT: strh wzr @@ -137,7 +137,7 @@ } ; CHECK-LABEL: Sturh_zero_4 -; CHECK: stp wzr, wzr +; CHECK: stur xzr ; CHECK-STRICT-LABEL: Sturh_zero_4 ; CHECK-STRICT: sturh wzr ; CHECK-STRICT: sturh wzr Index: test/CodeGen/AArch64/arm64-variadic-aapcs.ll =================================================================== --- test/CodeGen/AArch64/arm64-variadic-aapcs.ll +++ test/CodeGen/AArch64/arm64-variadic-aapcs.ll @@ -99,9 +99,9 @@ ; __stack field should point just past them. define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) { ; CHECK-LABEL: test_offsetstack: -; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #-80]! ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96 ; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var +; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp] ; CHECK: str [[STACK_TOP]], [x[[VAR]]] %addr = bitcast %va_list* @var to i8* Index: test/CodeGen/AArch64/fold-constants.ll =================================================================== --- test/CodeGen/AArch64/fold-constants.ll +++ test/CodeGen/AArch64/fold-constants.ll @@ -20,8 +20,8 @@ ; PR25763 - folding constant vector comparisons with sign-extended result define <8 x i16> @dotests_458() { ; CHECK-LABEL: dotests_458 -; CHECK: movi d0, #0x00000000ff0000 -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK: adrp x8, .LCPI1_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0] ; CHECK-NEXT: ret entry: %vclz_v.i = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> , i1 false) #6 Index: test/CodeGen/AMDGPU/and.ll =================================================================== --- test/CodeGen/AMDGPU/and.ll +++ test/CodeGen/AMDGPU/and.ll @@ -382,7 +382,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000 ; SI-NOT: and @@ -397,7 +397,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000 ; SI-NOT: and @@ -412,7 +412,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000 ; SI-NOT: and @@ -427,7 +427,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000 ; SI-NOT: and @@ -440,7 +440,7 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64: ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0 ; SI-NOT: and @@ -453,7 +453,7 @@ ; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64: ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0 ; SI-NOT: and @@ -468,7 +468,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: 
s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000 ; SI-NOT: and @@ -483,7 +483,7 @@ ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0 ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000 ; SI-NOT: and @@ -526,7 +526,7 @@ ; Shift into upper 32-bits ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and @@ -539,7 +539,7 @@ ; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64: ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 ; SI-NOT: and Index: test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll =================================================================== --- test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll +++ test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll @@ -7,7 +7,6 @@ ; GCN-LABEL: {{^}}vcc_shrink_vcc_def: ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc -; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) { bb0: %tmp = icmp sgt i32 %arg1, 4 @@ -34,7 +33,6 @@ ; GCN-LABEL: {{^}}preserve_condition_undef_flag: ; GCN-NOT: vcc ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc -; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { bb0: %tmp = icmp sgt i32 %arg1, 4 Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -93,14 +93,11 @@ ; GCN-DAG: v_cvt_f32_ubyte2_e32 ; GCN-DAG: v_cvt_f32_ubyte3_e32 -; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 - ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 -; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8 ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, -; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00, ; SI-DAG: v_add_i32 +; VI-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffffff00, ; VI-DAG: v_add_u16_e32 ; VI-DAG: v_add_u16_e32 Index: test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-combines.ll +++ test/CodeGen/AMDGPU/fneg-combines.ll @@ -1004,7 +1004,7 @@ ; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]] ; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]] -; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]] +; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]] ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]] ; GCN: buffer_store_dword [[NEG_MAD]] Index: test/CodeGen/AMDGPU/load-constant-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i16.ll +++ test/CodeGen/AMDGPU/load-constant-i16.ll @@ -137,8 +137,7 @@ ; v2i16 is naturally 4 byte aligned ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal -; EG: 16 +; EG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal ; EG: 16 define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in @@ -156,8 +155,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW 
[[ST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal -; TODO: We should use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal +; EG-DAG: ASHR {{[* ]*}}[[ST]].Y, {{.*}}, literal ; EG-DAG: 16 ; EG-DAG: 16 define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { @@ -221,8 +219,8 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}} ; EG: VTX_READ_64 [[LD:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 ; TODO: This should use LD, but for some there are redundant MOVs -; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*\.[XYZW]}}, literal -; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*\.[XYZW]}}, literal +; EG-DAG: LSHR {{[* ]*}}[[ST]].Y, {{.*\.[XYZW]}}, literal +; EG-DAG: LSHR {{[* ]*}}[[ST]].W, {{.*\.[XYZW]}}, literal ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: AND_INT {{[* ]*}}[[ST]].X, {{T[0-9]\.[XYZW]}}, literal @@ -247,9 +245,8 @@ ; TODO: This should use LD, but for some there are redundant MOVs ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal -; TODO: We should use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal +; EG-DAG: ASHR {{[* ]*}}[[ST]].Y, {{.*}}, literal +; EG-DAG: ASHR {{[* ]*}}[[ST]].W, {{.*}}, literal ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 @@ -270,12 +267,11 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, ; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use LSHR instead of BFE_UINT ; TODO: This should use DST, but for some there are redundant MOVs -; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal -; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal -; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal -; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal +; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal +; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].W, {{.*}}, literal +; EG-DAG: LSHR {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal +; EG-DAG: LSHR {{[* ]*}}[[ST_HI]].W, {{.*}}, literal ; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal ; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal ; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal @@ -304,12 +300,11 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, ; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 -; TODO: 4 of these should use ASHR instead of LSHR + BFE_INT ; TODO: This should use DST, but for some there are redundant MOVs -; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal +; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal +; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].W, {{.*}}, literal +; EG-DAG: ASHR {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal +; EG-DAG: ASHR {{[* ]*}}[[ST_HI]].W, {{.*}}, literal ; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal Index: 
test/CodeGen/AMDGPU/load-constant-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i8.ll +++ test/CodeGen/AMDGPU/load-constant-i8.ll @@ -346,22 +346,22 @@ ; EG: VTX_READ_128 [[DST:T[0-9]+\.XYZW]], T{{[0-9]+}}.X, 0, #1 ; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 Index: test/CodeGen/AMDGPU/load-global-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i16.ll +++ test/CodeGen/AMDGPU/load-global-i16.ll @@ -146,7 +146,7 @@ ; GCN-HSA: flat_load_dword ; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal +; EGCM: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal ; EGCM: 16 define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in @@ -164,8 +164,8 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}} ; EGCM: VTX_READ_32 [[DST:T[0-9].[XYZW]]], [[DST]], 0, #1 ; TODO: This should use ASHR instead of LSHR + BFE +; EGCM-DAG: ASHR {{[* ]*}}[[ST]].Y, {{.*}}, literal ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal -; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, 
{{PV.[XYZW]}}, 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { @@ -231,9 +231,9 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, ; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 ; TODO: This should use DST, but for some there are redundant MOVs -; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*}}, literal +; EGCM-DAG: LSHR {{[* ]*}}[[ST]].Y, {{.*}}, literal ; EGCM-DAG: 16 -; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*}}, literal +; EGCM-DAG: LSHR {{[* ]*}}[[ST]].W, {{.*}}, literal ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal ; EGCM-DAG: 16 @@ -254,10 +254,10 @@ ; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 ; TODO: We should use ASHR instead of LSHR + BFE ; TODO: This should use DST, but for some there are redundant MOVs +; EGCM-DAG: ASHR {{[* ]*}}[[ST]].Y, {{.*}}, literal +; EGCM-DAG: ASHR {{[* ]*}}[[ST]].W, {{.*}}, literal ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal -; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal -; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; EGCM-DAG: 16 @@ -279,11 +279,10 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, ; EGCM: CF_END ; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use LSHR instead of BFE_UINT -; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal -; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal -; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal -; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal +; EGCM-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal +; EGCM-DAG: LSHR {{[* ]*}}[[ST_LO]].W, {{.*}}, literal +; EGCM-DAG: LSHR {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal +; EGCM-DAG: LSHR {{[* ]*}}[[ST_HI]].W, {{.*}}, literal ; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal ; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal ; EGCM-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal @@ -313,11 +312,10 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, ; EGCM: CF_END ; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use ASHR instead of LSHR + BFE_INT -; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal -; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal -; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal -; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal +; EGCM-DAG: ASHR {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal +; EGCM-DAG: ASHR {{[* ]*}}[[ST_LO]].W, {{.*}}, literal +; EGCM-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal +; EGCM-DAG: ASHR {{[* ]*}}[[ST_HI]].W, {{.*}}, literal ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal Index: test/CodeGen/AMDGPU/load-global-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i8.ll +++ test/CodeGen/AMDGPU/load-global-i8.ll @@ -352,22 +352,22 @@ ; EG: VTX_READ_128 [[DST:T[0-9]+\.XYZW]], T{{[0-9]+}}.X, 0, #1 ; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, 
literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9]+.[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 Index: test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i16.ll +++ test/CodeGen/AMDGPU/load-local-i16.ll @@ -180,7 +180,6 @@ ; EG: LDS_READ_RET ; EG: BFE_INT -; EG: BFE_INT define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = sext <2 x i16> %load to <2 x i32> Index: test/CodeGen/AMDGPU/r600.bitcast.ll =================================================================== --- test/CodeGen/AMDGPU/r600.bitcast.ll +++ test/CodeGen/AMDGPU/r600.bitcast.ll @@ -82,8 +82,7 @@ ; FUNC-LABEL: {{^}}v4i16_extract_i8: ; EG: MEM_RAT MSKOR {{T[0-9]+\.XW}}, [[ST_PTR:T[0-9]+\.[XYZW]]] ; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG-DAG: BFE_UINT +; EG: LSHR {{[* ]*}}T{{[0-9]+}}.W, T{{[0-9]+}}.X, literal ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal define amdgpu_kernel void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 2 Index: test/CodeGen/AMDGPU/setcc.ll =================================================================== --- test/CodeGen/AMDGPU/setcc.ll +++ test/CodeGen/AMDGPU/setcc.ll @@ -397,9 +397,9 @@ } ; FUNC-LABEL: setcc-i1-and-xor -; GCN-DAG: v_cmp_ge_f32_e64 
[[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_cmp_le_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 -; GCN: s_and_b64 s[2:3], [[A]], [[B]] +; GCN-DAG: v_cmp_nge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} +; GCN-DAG: v_cmp_nle_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 +; GCN: s_or_b64 vcc, [[A]], [[B]] define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 { bb0: %tmp5 = fcmp oge float %cond, 0.000000e+00 Index: test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll =================================================================== --- test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -306,7 +306,7 @@ } ; GCN-LABEL: {{^}}and_not_mask_i64: -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN-DAG: buffer_load_dword v[[VALLO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}} ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]] Index: test/CodeGen/AMDGPU/shift-i64-opts.ll =================================================================== --- test/CodeGen/AMDGPU/shift-i64-opts.ll +++ test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -54,9 +54,9 @@ ; after 64-bit shift is split. ; GCN-LABEL: {{^}}lshr_and_i64_35: -; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23 +; GCN: buffer_load_dword v[[HI:[0-9]+]] ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in Index: test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll =================================================================== --- test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll +++ test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll @@ -4,7 +4,7 @@ define void @vst(i8* %m, [4 x i64] %v) { entry: ; CHECK: vst: -; CHECK: VST1d64Q %R{{[0-9]+}}, 8, %D{{[0-9]+}}, pred:14, pred:%noreg, %Q{{[0-9]+}}_Q{{[0-9]+}} +; CHECK: VST1d64Q %R{{[0-9]+}}, 8, %D{{[0-9]+}}, pred:14, pred:%noreg, %Q{{[0-9]+}}_Q{{[0-9]+}} %v0 = extractvalue [4 x i64] %v, 0 %v1 = extractvalue [4 x i64] %v, 1 Index: test/CodeGen/ARM/vector-DAGCombine.ll =================================================================== --- test/CodeGen/ARM/vector-DAGCombine.ll +++ test/CodeGen/ARM/vector-DAGCombine.ll @@ -48,7 +48,6 @@ %2 = bitcast double %1 to i64 %3 = insertelement <1 x i64> undef, i64 %2, i32 0 ; CHECK-NOT: vmov s -; CHECK: vext.8 %4 = shufflevector <1 x i64> %3, <1 x i64> undef, <2 x i32> %tmp2006.3 = bitcast <2 x i64> %4 to <16 x i8> %5 = shufflevector <16 x i8> %tmp2006.3, <16 x i8> undef, <16 x i32> Index: test/CodeGen/PowerPC/no-pref-jumps.ll =================================================================== --- test/CodeGen/PowerPC/no-pref-jumps.ll +++ test/CodeGen/PowerPC/no-pref-jumps.ll @@ -13,7 +13,7 @@ ; CHECK-LABEL: @foo ; CHECK: cmpwi ; CHECK: cmpwi -; CHECK: cror +; CHECK: crand ; CHECK: blr if.then: ; preds = %entry Index: test/CodeGen/SPARC/32abi.ll =================================================================== --- test/CodeGen/SPARC/32abi.ll +++ test/CodeGen/SPARC/32abi.ll @@ -201,8 +201,7 @@ ; CHECK-BE-NEXT: addcc %i2, %i1, %i1 ; CHECK-BE-NEXT: addxcc %i0, 0, %i0 ; -; CHECK-LE: ld [%fp+96], %g2 -; CHECK-LE-NEXT: ld [%fp+100], %g3 +; CHECK-LE: ldd 
[%fp+96], %g2 ; CHECK-LE-NEXT: ld [%fp+92], %g4 ; CHECK-LE-NEXT: addcc %i0, %i2, %i0 ; CHECK-LE-NEXT: addxcc %i1, 0, %i1 Index: test/CodeGen/SystemZ/selectcc-01.ll =================================================================== --- test/CodeGen/SystemZ/selectcc-01.ll +++ test/CodeGen/SystemZ/selectcc-01.ll @@ -69,8 +69,8 @@ define i32 @f6(float %a, float %b) { ; CHECK-LABEL: f6: ; CHECK: ipm %r2 -; CHECK-NEXT: afi %r2, 268435456 ; CHECK-NEXT: sll %r2, 2 +; CHECK-NEXT: afi %r2, 1073741824 ; CHECK-NEXT: sra %r2, 31 ; CHECK: br %r14 %cond = fcmp one float %a, %b @@ -106,8 +106,8 @@ define i32 @f9(float %a, float %b) { ; CHECK-LABEL: f9: ; CHECK: ipm %r2 -; CHECK-NEXT: afi %r2, -268435456 ; CHECK-NEXT: sll %r2, 2 +; CHECK-NEXT: afi %r2, -1073741824 ; CHECK-NEXT: sra %r2, 31 ; CHECK: br %r14 %cond = fcmp ueq float %a, %b Index: test/CodeGen/SystemZ/selectcc-02.ll =================================================================== --- test/CodeGen/SystemZ/selectcc-02.ll +++ test/CodeGen/SystemZ/selectcc-02.ll @@ -68,8 +68,8 @@ define i32 @f6(float %a, float %b) { ; CHECK-LABEL: f6: ; CHECK: ipm %r2 -; CHECK-NEXT: afi %r2, -268435456 ; CHECK-NEXT: sll %r2, 2 +; CHECK-NEXT: afi %r2, -1073741824 ; CHECK-NEXT: sra %r2, 31 ; CHECK: br %r14 %cond = fcmp one float %a, %b @@ -105,8 +105,8 @@ define i32 @f9(float %a, float %b) { ; CHECK-LABEL: f9: ; CHECK: ipm %r2 -; CHECK-NEXT: afi %r2, 268435456 ; CHECK-NEXT: sll %r2, 2 +; CHECK-NEXT: afi %r2, 1073741824 ; CHECK-NEXT: sra %r2, 31 ; CHECK: br %r14 %cond = fcmp ueq float %a, %b Index: test/CodeGen/X86/avx-vperm2x128.ll =================================================================== --- test/CodeGen/X86/avx-vperm2x128.ll +++ test/CodeGen/X86/avx-vperm2x128.ll @@ -194,10 +194,10 @@ define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp { ; AVX1-LABEL: shuffle_v16i16_4501_mem: ; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovaps (%rsi), %ymm1 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovaps (%rsi), %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_4501_mem: Index: test/CodeGen/X86/avx512-any_extend_load.ll =================================================================== --- test/CodeGen/X86/avx512-any_extend_load.ll +++ test/CodeGen/X86/avx512-any_extend_load.ll @@ -31,7 +31,7 @@ ; KNL-LABEL: any_extend_load_v8i32: ; KNL: # BB#0: ; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; KNL-NEXT: vmovq %xmm0, (%rdi) ; KNL-NEXT: retq Index: test/CodeGen/X86/constant-combines.ll =================================================================== --- test/CodeGen/X86/constant-combines.ll +++ test/CodeGen/X86/constant-combines.ll @@ -15,13 +15,8 @@ ; ; CHECK-LABEL: PR22524: ; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq $0, (%rdi) ; CHECK-NEXT: movl $0, 4(%rdi) -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: mulss %xmm0, %xmm1 -; CHECK-NEXT: movl $0, (%rdi) -; CHECK-NEXT: movss %xmm1, 4(%rdi) ; CHECK-NEXT: retq entry: %0 = getelementptr inbounds { float, 
float }, { float, float }* %arg, i32 0, i32 1 Index: test/CodeGen/X86/divide-by-constant.ll =================================================================== --- test/CodeGen/X86/divide-by-constant.ll +++ test/CodeGen/X86/divide-by-constant.ll @@ -48,7 +48,6 @@ ; X32: # BB#0: # %entry ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: imull $171, %eax, %eax -; X32-NEXT: andl $65024, %eax # imm = 0xFE00 ; X32-NEXT: shrl $9, %eax ; X32-NEXT: # kill: %AL %AL %EAX ; X32-NEXT: retl @@ -56,7 +55,6 @@ ; X64-LABEL: test3: ; X64: # BB#0: # %entry ; X64-NEXT: imull $171, %esi, %eax -; X64-NEXT: andl $65024, %eax # imm = 0xFE00 ; X64-NEXT: shrl $9, %eax ; X64-NEXT: # kill: %AL %AL %EAX ; X64-NEXT: retq @@ -167,7 +165,6 @@ ; X32-NEXT: shrb %al ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: imull $211, %eax, %eax -; X32-NEXT: andl $24576, %eax # imm = 0x6000 ; X32-NEXT: shrl $13, %eax ; X32-NEXT: # kill: %AL %AL %EAX ; X32-NEXT: retl @@ -177,7 +174,6 @@ ; X64-NEXT: shrb %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: imull $211, %eax, %eax -; X64-NEXT: andl $24576, %eax # imm = 0x6000 ; X64-NEXT: shrl $13, %eax ; X64-NEXT: # kill: %AL %AL %EAX ; X64-NEXT: retq @@ -192,7 +188,6 @@ ; X32-NEXT: shrb $2, %al ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: imull $71, %eax, %eax -; X32-NEXT: andl $6144, %eax # imm = 0x1800 ; X32-NEXT: shrl $11, %eax ; X32-NEXT: # kill: %AL %AL %EAX ; X32-NEXT: retl @@ -202,7 +197,6 @@ ; X64-NEXT: shrb $2, %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: imull $71, %eax, %eax -; X64-NEXT: andl $6144, %eax # imm = 0x1800 ; X64-NEXT: shrl $11, %eax ; X64-NEXT: # kill: %AL %AL %EAX ; X64-NEXT: retq Index: test/CodeGen/X86/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -5,12 +5,8 @@ ; CHECK-LABEL: i24_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orl $384, %eax # imm = 0x180 +; CHECK-NEXT: movw %ax, (%rdi) ; CHECK-NEXT: retq %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -22,13 +18,9 @@ ; CHECK-LABEL: i24_and_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orl $384, %eax # imm = 0x180 +; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80 +; CHECK-NEXT: movw %ax, (%rdi) ; CHECK-NEXT: retq %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 @@ -42,14 +34,10 @@ ; CHECK: # BB#0: ; CHECK-NEXT: movzbl %sil, %eax ; CHECK-NEXT: movzwl (%rdi), %ecx -; CHECK-NEXT: movzbl 2(%rdi), %edx -; CHECK-NEXT: movb %dl, 2(%rdi) -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx ; CHECK-NEXT: shll $13, %eax -; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF -; CHECK-NEXT: orl %eax, %edx -; CHECK-NEXT: movw %dx, (%rdi) +; CHECK-NEXT: andl $57343, %ecx # imm = 0xDFFF +; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: movw %cx, (%rdi) ; CHECK-NEXT: retq %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 @@ -64,18 +52,8 @@ ; CHECK-LABEL: i56_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; 
CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movl %edx, (%rdi) -; CHECK-NEXT: shrq $32, %rdx -; CHECK-NEXT: movw %dx, 4(%rdi) +; CHECK-NEXT: movw %ax, 4(%rdi) +; CHECK-NEXT: orl $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i56, i56* %a, align 1 %b = or i56 %aa, 384 @@ -87,20 +65,11 @@ ; CHECK-LABEL: i56_and_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80 -; CHECK-NEXT: andq %rdx, %rax -; CHECK-NEXT: movl %eax, (%rdi) -; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: movl $384, %ecx # imm = 0x180 +; CHECK-NEXT: orl (%rdi), %ecx +; CHECK-NEXT: andl $-128, %ecx ; CHECK-NEXT: movw %ax, 4(%rdi) +; CHECK-NEXT: movl %ecx, (%rdi) ; CHECK-NEXT: retq %b = load i56, i56* %a, align 1 %c = and i56 %b, -128 @@ -112,23 +81,14 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; CHECK-LABEL: i56_insert_bit: ; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl 4(%rdi), %ecx -; CHECK-NEXT: movzbl 6(%rdi), %edx -; CHECK-NEXT: movl (%rdi), %esi -; CHECK-NEXT: movb %dl, 6(%rdi) -; CHECK-NEXT: # kill: %EDX %EDX %RDX %RDX -; CHECK-NEXT: shll $16, %edx +; CHECK-NEXT: movzwl 4(%rdi), %eax +; CHECK-NEXT: movl $-8193, %ecx # imm = 0xDFFF +; CHECK-NEXT: andl (%rdi), %ecx +; CHECK-NEXT: movzbl %sil, %edx +; CHECK-NEXT: shll $13, %edx ; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shlq $32, %rdx -; CHECK-NEXT: orq %rdx, %rsi -; CHECK-NEXT: shlq $13, %rax -; CHECK-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movl %ecx, (%rdi) -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movw %cx, 4(%rdi) +; CHECK-NEXT: movw %ax, 4(%rdi) +; CHECK-NEXT: movl %edx, (%rdi) ; CHECK-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 Index: test/CodeGen/X86/known-bits.ll =================================================================== --- test/CodeGen/X86/known-bits.ll +++ test/CodeGen/X86/known-bits.ll @@ -12,7 +12,6 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movzbl (%eax), %eax ; X32-NEXT: imull $101, %eax, %eax -; X32-NEXT: andl $16384, %eax # imm = 0x4000 ; X32-NEXT: shrl $14, %eax ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: vmovd %eax, %xmm0 @@ -50,7 +49,6 @@ ; X64: # BB#0: # %BB ; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: imull $101, %eax, %eax -; X64-NEXT: andl $16384, %eax # imm = 0x4000 ; X64-NEXT: shrl $14, %eax ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: vmovd %eax, %xmm0 Index: test/CodeGen/X86/legalize-shift-64.ll =================================================================== --- test/CodeGen/X86/legalize-shift-64.ll +++ test/CodeGen/X86/legalize-shift-64.ll @@ -150,26 +150,15 @@ ; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $1, (%esp) +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: jne .LBB5_3 +; CHECK-NEXT: # BB#1: # %if.then ; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: xorl 
%ecx, %ecx -; CHECK-NEXT: shldl $32, %eax, %ecx -; CHECK-NEXT: movb $32, %dl -; CHECK-NEXT: testb %dl, %dl -; CHECK-NEXT: jne .LBB5_2 -; CHECK-NEXT: # BB#1: -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: .LBB5_2: -; CHECK-NEXT: sete %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: xorl $1, %eax -; CHECK-NEXT: orl %ecx, %eax -; CHECK-NEXT: je .LBB5_5 -; CHECK-NEXT: # BB#3: # %if.then -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: jmp .LBB5_4 -; CHECK-NEXT: .LBB5_5: # %if.end +; CHECK-NEXT: jmp .LBB5_2 +; CHECK-NEXT: .LBB5_3: # %if.end ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: .LBB5_4: # %if.then +; CHECK-NEXT: .LBB5_2: # %if.then ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl Index: test/CodeGen/X86/movmsk.ll =================================================================== --- test/CodeGen/X86/movmsk.ll +++ test/CodeGen/X86/movmsk.ll @@ -100,9 +100,8 @@ define void @float_call_signbit(double %n) { ; CHECK-LABEL: float_call_signbit: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: movq %xmm0, %rdi -; CHECK-NEXT: shrq $63, %rdi -; CHECK-NEXT: ## kill: %EDI %EDI %RDI +; CHECK-NEXT: movmskpd %xmm0, %edi +; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL entry: %t0 = bitcast double %n to i64 Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -233,42 +233,43 @@ define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind { ; SSE2-LABEL: v7i8: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,0,255,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SSE2-NEXT: movb %al, 6(%rdi) -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: movd %xmm2, (%rdi) +; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v7i8: ; SSE42: # BB#0: -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[12],zero,xmm0[4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi) -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] -; SSE42-NEXT: 
pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[8],zero,xmm1[8],zero,xmm1[12,0,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm1 ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdi) ; SSE42-NEXT: movd %xmm1, (%rdi) ; SSE42-NEXT: retq ; ; AVX-LABEL: v7i8: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[12],zero,xmm0[4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[8],zero,xmm1[8],zero,xmm1[12,0,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi) ; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX-NEXT: vmovd %xmm0, (%rdi) Index: test/CodeGen/X86/or-branch.ll =================================================================== --- test/CodeGen/X86/or-branch.ll +++ test/CodeGen/X86/or-branch.ll @@ -19,11 +19,10 @@ ; JUMP1-LABEL: foo: ; JUMP1: # BB#0: # %entry ; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; JUMP1-NEXT: sete %al -; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp) -; JUMP1-NEXT: setl %cl -; JUMP1-NEXT: orb %al, %cl -; JUMP1-NEXT: cmpb $1, %cl +; JUMP1-NEXT: setne %al +; JUMP1-NEXT: cmpl $4, {{[0-9]+}}(%esp) +; JUMP1-NEXT: setg %cl +; JUMP1-NEXT: testb %al, %cl ; JUMP1-NEXT: jne .LBB0_1 ; JUMP1-NEXT: # BB#2: # %cond_true ; JUMP1-NEXT: jmp bar # TAILCALL @@ -50,11 +49,10 @@ ; JUMP2-LABEL: unpredictable: ; JUMP2: # BB#0: # %entry ; JUMP2-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; JUMP2-NEXT: sete %al -; JUMP2-NEXT: cmpl $5, {{[0-9]+}}(%esp) -; JUMP2-NEXT: setl %cl -; JUMP2-NEXT: orb %al, %cl -; JUMP2-NEXT: cmpb $1, %cl +; JUMP2-NEXT: setne %al +; JUMP2-NEXT: cmpl $4, {{[0-9]+}}(%esp) +; JUMP2-NEXT: setg %cl +; JUMP2-NEXT: testb %al, %cl ; JUMP2-NEXT: jne .LBB1_1 ; JUMP2-NEXT: # BB#2: # %cond_true ; JUMP2-NEXT: jmp bar # TAILCALL @@ -64,11 +62,10 @@ ; JUMP1-LABEL: unpredictable: ; JUMP1: # BB#0: # %entry ; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; JUMP1-NEXT: sete %al -; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp) -; JUMP1-NEXT: setl %cl -; JUMP1-NEXT: orb %al, %cl -; JUMP1-NEXT: cmpb $1, %cl +; JUMP1-NEXT: setne %al +; JUMP1-NEXT: cmpl $4, {{[0-9]+}}(%esp) +; JUMP1-NEXT: setg %cl +; JUMP1-NEXT: testb %al, %cl ; JUMP1-NEXT: jne .LBB1_1 ; JUMP1-NEXT: # BB#2: # %cond_true ; JUMP1-NEXT: jmp bar # TAILCALL Index: test/CodeGen/X86/popcnt.ll =================================================================== --- test/CodeGen/X86/popcnt.ll +++ test/CodeGen/X86/popcnt.ll @@ -71,7 +71,6 @@ ; X32-NEXT: andl $13107, %eax # imm = 0x3333 ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $32752, %ecx # imm = 0x7FF0 ; X32-NEXT: shrl $4, %ecx ; X32-NEXT: addl %eax, %ecx ; X32-NEXT: andl $3855, %ecx # imm = 0xF0F @@ -94,7 +93,6 @@ ; X64-NEXT: andl $13107, %edi # imm = 0x3333 ; X64-NEXT: addl %eax, %edi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $32752, %eax # imm = 0x7FF0 ; X64-NEXT: shrl $4, %eax ; X64-NEXT: addl %edi, %eax ; X64-NEXT: andl $3855, %eax # imm = 0xF0F Index: test/CodeGen/X86/sse3.ll =================================================================== --- test/CodeGen/X86/sse3.ll +++ test/CodeGen/X86/sse3.ll @@ -270,8 +270,7 @@ define <4 x i32> @t17() nounwind { ; 
X64-LABEL: t17: ; X64: ## BB#0: ## %entry -; X64-NEXT: movaps (%rax), %xmm0 -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,1,1] ; X64-NEXT: pxor %xmm1, %xmm1 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: retq Index: test/CodeGen/X86/urem-i8-constant.ll =================================================================== --- test/CodeGen/X86/urem-i8-constant.ll +++ test/CodeGen/X86/urem-i8-constant.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s define i8 @foo(i8 %tmp325) { @@ -6,7 +6,6 @@ ; CHECK: # BB#0: ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: imull $111, %ecx, %eax -; CHECK-NEXT: andl $28672, %eax # imm = 0x7000 ; CHECK-NEXT: shrl $12, %eax ; CHECK-NEXT: movb $37, %dl ; CHECK-NEXT: # kill: %AL %AL %EAX @@ -14,7 +13,6 @@ ; CHECK-NEXT: subb %al, %cl ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: retl -; %t546 = urem i8 %tmp325, 37 ret i8 %t546 } Index: test/CodeGen/X86/vec_extract-mmx.ll =================================================================== --- test/CodeGen/X86/vec_extract-mmx.ll +++ test/CodeGen/X86/vec_extract-mmx.ll @@ -5,20 +5,10 @@ define i32 @test0(<1 x i64>* %v4) nounwind { ; X32-LABEL: test0: ; X32: # BB#0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %eax -; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X32-NEXT: movl %ecx, (%esp) -; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3] +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pshufw $238, (%eax), %mm0 # mm0 = mem[2,3,2,3] ; X32-NEXT: movd %mm0, %eax ; X32-NEXT: addl $32, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp ; X32-NEXT: retl ; ; X64-LABEL: test0: Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ test/CodeGen/X86/vector-sext.ll @@ -1518,7 +1518,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; SSE2-LABEL: load_sext_4i1_to_4i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movzbl (%rdi), %eax ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $3, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 @@ -1544,7 +1544,7 @@ ; ; SSSE3-LABEL: load_sext_4i1_to_4i64: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movl (%rdi), %eax +; SSSE3-NEXT: movzbl (%rdi), %eax ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $3, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 @@ -1570,7 +1570,7 @@ ; ; SSE41-LABEL: load_sext_4i1_to_4i64: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movl (%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %eax ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl %ecx ; SSE41-NEXT: movd %eax, %xmm1 @@ -2191,23 +2191,13 @@ ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movzbl (%rdi), %eax ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $6, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrl $7, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $2, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $3, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 -; 
SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $4, %ecx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSE2-NEXT: movl %eax, %ecx
 ; SSE2-NEXT: shrl $5, %ecx
 ; SSE2-NEXT: andl $1, %ecx
@@ -2217,15 +2207,24 @@
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm2
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $3, %ecx
+; SSE2-NEXT: shrl $6, %ecx
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: shrl $7, %eax
-; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: movl %eax, %ecx
+; SSE2-NEXT: shrl $2, %ecx
+; SSE2-NEXT: andl $1, %ecx
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: movl %eax, %ecx
+; SSE2-NEXT: andl $1, %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: shrl $4, %eax
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -2240,23 +2239,13 @@
 ; SSSE3: # BB#0: # %entry
 ; SSSE3-NEXT: movzbl (%rdi), %eax
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $6, %ecx
-; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: shrl $7, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm0
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $2, %ecx
-; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: movl %eax, %ecx
+; SSSE3-NEXT: shrl $3, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $4, %ecx
-; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSSE3-NEXT: movl %eax, %ecx
 ; SSSE3-NEXT: shrl $5, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
@@ -2266,15 +2255,24 @@
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm2
 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $3, %ecx
+; SSSE3-NEXT: shrl $6, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: shrl $7, %eax
-; SSSE3-NEXT: movzwl %ax, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: movl %eax, %ecx
+; SSSE3-NEXT: shrl $2, %ecx
+; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSSE3-NEXT: movl %eax, %ecx
+; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: shrl $4, %eax
+; SSSE3-NEXT: andl $1, %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -2316,7 +2314,6 @@
 ; SSE41-NEXT: andl $1, %ecx
 ; SSE41-NEXT: pinsrw $6, %ecx, %xmm1
 ; SSE41-NEXT: shrl $7, %eax
-; SSE41-NEXT: movzwl %ax, %eax
 ; SSE41-NEXT: pinsrw $7, %eax, %xmm1
 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; SSE41-NEXT: pslld $31, %xmm0
@@ -3002,52 +2999,32 @@
 ; SSE2: # BB#0: # %entry
 ; SSE2-NEXT: movzwl (%rdi), %eax
 ; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $14, %ecx
-; SSE2-NEXT: andl $1, %ecx
+; SSE2-NEXT: shrl $15, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm0
 ; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $6, %ecx
+; SSE2-NEXT: shrl $7, %ecx
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm1
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $10, %ecx
+; SSE2-NEXT: shrl $11, %ecx
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm0
 ; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $2, %ecx
+; SSE2-NEXT: shrl $3, %ecx
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm2
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $12, %ecx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $4, %ecx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movl %eax, %ecx
 ; SSE2-NEXT: shrl $13, %ecx
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm0
 ; SSE2-NEXT: movl %eax, %ecx
 ; SSE2-NEXT: shrl $5, %ecx
 ; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT: movl %eax, %ecx
 ; SSE2-NEXT: shrl $9, %ecx
 ; SSE2-NEXT: andl $1, %ecx
@@ -3057,26 +3034,45 @@
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm0
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $11, %ecx
+; SSE2-NEXT: shrl $14, %ecx
+; SSE2-NEXT: andl $1, %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movl %eax, %ecx
+; SSE2-NEXT: shrl $6, %ecx
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $3, %ecx
+; SSE2-NEXT: shrl $10, %ecx
+; SSE2-NEXT: andl $1, %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movl %eax, %ecx
+; SSE2-NEXT: shrl $2, %ecx
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $7, %ecx
+; SSE2-NEXT: shrl $12, %ecx
+; SSE2-NEXT: andl $1, %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movl %eax, %ecx
+; SSE2-NEXT: shrl $4, %ecx
 ; SSE2-NEXT: andl $1, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: shrl $15, %eax
-; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: movl %eax, %ecx
+; SSE2-NEXT: andl $1, %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: shrl $8, %eax
+; SSE2-NEXT: andl $1, %eax
 ; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -3091,52 +3087,32 @@
 ; SSSE3: # BB#0: # %entry
 ; SSSE3-NEXT: movzwl (%rdi), %eax
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $14, %ecx
-; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: shrl $15, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm0
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $6, %ecx
+; SSSE3-NEXT: shrl $7, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm1
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $10, %ecx
+; SSSE3-NEXT: shrl $11, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm0
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $2, %ecx
+; SSSE3-NEXT: shrl $3, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm2
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $12, %ecx
-; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $4, %ecx
-; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $8, %ecx
-; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: movl %eax, %ecx
 ; SSSE3-NEXT: shrl $13, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm0
 ; SSSE3-NEXT: movl %eax, %ecx
 ; SSSE3-NEXT: shrl $5, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT: movl %eax, %ecx
 ; SSSE3-NEXT: shrl $9, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
@@ -3146,26 +3122,45 @@
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm0
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $11, %ecx
+; SSSE3-NEXT: shrl $14, %ecx
+; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movl %eax, %ecx
+; SSSE3-NEXT: shrl $6, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $3, %ecx
+; SSSE3-NEXT: shrl $10, %ecx
+; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movl %eax, %ecx
+; SSSE3-NEXT: shrl $2, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $7, %ecx
+; SSSE3-NEXT: shrl $12, %ecx
+; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movl %eax, %ecx
+; SSSE3-NEXT: shrl $4, %ecx
 ; SSSE3-NEXT: andl $1, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: shrl $15, %eax
-; SSSE3-NEXT: movzwl %ax, %eax
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT: movl %eax, %ecx
+; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: shrl $8, %eax
+; SSSE3-NEXT: andl $1, %eax
 ; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -3239,7 +3234,6 @@
 ; SSE41-NEXT: andl $1, %ecx
 ; SSE41-NEXT: pinsrb $14, %ecx, %xmm1
 ; SSE41-NEXT: shrl $15, %eax
-; SSE41-NEXT: movzwl %ax, %eax
 ; SSE41-NEXT: pinsrb $15, %eax, %xmm1
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT: psllw $15, %xmm0