Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18558,16 +18558,25 @@ unsigned DestSrcRatio = DestNumElts / SrcNumElts; if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) { unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio; - EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), - SrcVT.getScalarType(), NewExtNumElts); - if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && - TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { - unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; + EVT ScalarVT = SrcVT.getScalarType(); + if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0) { SDLoc DL(N); - SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); - SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, - V.getOperand(0), NewIndex); - return DAG.getBitcast(NVT, NewExtract); + unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; + EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), + ScalarVT, NewExtNumElts); + if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { + SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); + SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, + V.getOperand(0), NewIndex); + return DAG.getBitcast(NVT, NewExtract); + } + if (NewExtNumElts == 1 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) { + SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); + SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, + V.getOperand(0), NewIndex); + return DAG.getBitcast(NVT, NewExtract); + } } } } Index: llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -5,9 +5,8 @@ ; GFX9-LABEL: shuffle_v4f16_23uu: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -19,10 +18,10 @@ ; GFX9-LABEL: shuffle_v4f16_234u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -154,7 +153,7 @@ ; GFX9-LABEL: shuffle_v4f16_0101: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -181,9 +180,8 @@ ; GFX9-LABEL: shuffle_v4f16_0145: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -196,11 +194,9 @@ ; GFX9-LABEL: shuffle_v4f16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -226,9 +222,9 @@ ; GFX9-LABEL: shuffle_v4f16_2323: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -240,10 +236,8 @@ ; GFX9-LABEL: shuffle_v4f16_2345: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v1, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -256,10 +250,9 @@ ; GFX9-LABEL: shuffle_v4f16_2367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -271,10 +264,11 @@ ; GFX9-LABEL: shuffle_v4f16_4501: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -286,10 +280,11 @@ ; GFX9-LABEL: shuffle_v4f16_4523: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -301,7 +296,7 @@ ; GFX9-LABEL: shuffle_v4f16_4545: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -328,11 +323,11 @@ ; GFX9-LABEL: shuffle_v4f16_6701: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -344,10 +339,11 @@ ; GFX9-LABEL: shuffle_v4f16_6723: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -373,9 +369,9 @@ ; GFX9-LABEL: shuffle_v4f16_6767: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -388,13 +384,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -407,11 +402,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -485,13 +481,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -503,11 +498,9 @@ ; GFX9-LABEL: shuffle_v4i16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -590,12 +583,11 @@ ; GFX9-LABEL: shuffle_v4f16_2333: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -607,12 +599,11 @@ ; GFX9-LABEL: shuffle_v4f16_6667: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 Index: llvm/test/CodeGen/ARM/vdup.ll =================================================================== --- llvm/test/CodeGen/ARM/vdup.ll +++ llvm/test/CodeGen/ARM/vdup.ll @@ -429,8 +429,8 @@ define <2 x float> @check_f32(<4 x float> %v) nounwind { ; CHECK-LABEL: check_f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d17, r2, r3 -; CHECK-NEXT: vdup.32 d16, d17[1] +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vdup.32 d16, d16[1] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %x = extractelement <4 x float> %v, i32 3 @@ -442,8 +442,8 @@ define <2 x i32> @check_i32(<4 x i32> %v) nounwind { ; CHECK-LABEL: check_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d17, r2, r3 -; CHECK-NEXT: vdup.32 d16, d17[1] +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vdup.32 d16, d16[1] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %x = extractelement <4 x i32> %v, i32 3 Index: llvm/test/CodeGen/ARM/vext.ll =================================================================== --- llvm/test/CodeGen/ARM/vext.ll +++ llvm/test/CodeGen/ARM/vext.ll @@ -183,10 +183,10 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] ; CHECK-NEXT: vext.16 d16, d16, d17, #3 -; CHECK-NEXT: vorr d17, d16, d16 -; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vuzp.16 d16, d17 -; CHECK-NEXT: vzip.16 d16, d18 +; CHECK-NEXT: vorr d18, d16, d16 +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vuzp.16 d16, d18 +; CHECK-NEXT: vzip.16 d16, d17 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A @@ -199,10 +199,10 @@ define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; CHECK-LABEL: test_undef: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r1] -; CHECK-NEXT: vldr d17, [r0, #8] +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0, #8] ; CHECK-NEXT: vzip.16 d17, d16 -; CHECK-NEXT: vmov r0, r1, d17 +; CHECK-NEXT: vmov r0, r1, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -216,17 +216,15 @@ define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind { ; CHECK-LABEL: test_multisource: ; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d18, [r0, #32] ; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: add r2, r0, #48 -; CHECK-NEXT: add r0, r0, #32 +; CHECK-NEXT: vorr d22, d18, d18 ; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]! -; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] -; CHECK-NEXT: vorr d24, d20, d20 -; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128] -; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128] -; CHECK-NEXT: vzip.16 d24, d18 -; CHECK-NEXT: vtrn.16 q8, q11 -; CHECK-NEXT: vext.16 d18, d20, d24, #2 +; CHECK-NEXT: vldr d19, [r0, #48] +; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128] +; CHECK-NEXT: vzip.16 d22, d19 +; CHECK-NEXT: vtrn.16 q8, q10 +; CHECK-NEXT: vext.16 d18, d18, d22, #2 ; CHECK-NEXT: vext.16 d16, d18, d16, #2 ; CHECK-NEXT: vext.16 d16, d16, d16, #2 ; CHECK-NEXT: vmov r0, r1, d16 @@ -259,24 +257,24 @@ define <8 x i16> @test_illegal(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; CHECK-LABEL: test_illegal: ; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vorr d22, d16, d16 -; CHECK-NEXT: vmov.u16 r0, d16[0] -; CHECK-NEXT: vorr d23, d16, d16 -; CHECK-NEXT: vmov.u16 r2, d17[3] -; CHECK-NEXT: vmov.u16 r3, d17[1] -; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vmov.u16 r1, d19[1] -; CHECK-NEXT: vuzp.16 d22, d23 -; CHECK-NEXT: vuzp.16 d22, d18 -; CHECK-NEXT: vmov.16 d20[0], r0 -; CHECK-NEXT: vmov.16 d20[1], r2 -; CHECK-NEXT: vmov.16 d20[2], r3 -; CHECK-NEXT: vmov.16 d20[3], r1 -; CHECK-NEXT: vext.16 d21, d16, d18, #3 -; CHECK-NEXT: vmov r0, r1, d20 -; CHECK-NEXT: vmov r2, r3, d21 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vorr d22, d16, d16 +; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: vorr d23, d16, d16 +; CHECK-NEXT: vmov.u16 r2, d17[3] +; CHECK-NEXT: vmov.u16 r3, d17[1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vmov.u16 r1, d19[1] +; CHECK-NEXT: vuzp.16 d22, d23 +; CHECK-NEXT: vuzp.16 d22, d18 +; CHECK-NEXT: vmov.16 d20[0], r0 +; CHECK-NEXT: vmov.16 d20[1], r2 +; CHECK-NEXT: vmov.16 d20[2], r3 +; CHECK-NEXT: vmov.16 d20[3], r1 +; CHECK-NEXT: vext.16 d21, d16, d18, #3 +; CHECK-NEXT: vmov r0, r1, d20 +; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> Index: llvm/test/CodeGen/ARM/vpadd.ll =================================================================== --- llvm/test/CodeGen/ARM/vpadd.ll +++ llvm/test/CodeGen/ARM/vpadd.ll @@ -285,11 +285,11 @@ define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDL_s8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vext.8 d18, d16, d16, #1 +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vext.8 d17, d16, d16, #1 ; CHECK-NEXT: vshl.i16 d16, d16, #8 -; CHECK-NEXT: vshl.i16 d18, d18, #8 -; CHECK-NEXT: vshr.s16 d17, d18, #8 +; CHECK-NEXT: vshl.i16 d17, d17, #8 +; CHECK-NEXT: vshr.s16 d17, d17, #8 ; CHECK-NEXT: vsra.s16 d17, d16, #8 ; CHECK-NEXT: vstr d17, [r1] ; CHECK-NEXT: mov pc, lr @@ -347,11 +347,11 @@ define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDL_u8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vext.8 d18, d16, d16, #1 +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vext.8 d17, d16, d16, #1 ; CHECK-NEXT: vbic.i16 d16, #0xff00 -; CHECK-NEXT: vbic.i16 d18, #0xff00 -; CHECK-NEXT: vadd.i16 d16, d18, d16 +; CHECK-NEXT: vbic.i16 d17, #0xff00 +; CHECK-NEXT: vadd.i16 d16, d17, d16 ; CHECK-NEXT: vstr d16, [r1] ; CHECK-NEXT: mov pc, lr %tmp = load <16 x i8>, <16 x i8>* %cbcr @@ -368,7 +368,7 @@ define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDL_u8_early_zext: ; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vldr d16, [r0] ; CHECK-NEXT: vmovl.u8 q8, d16 ; CHECK-NEXT: vpadd.i16 d16, d16, d17 ; CHECK-NEXT: vstr d16, [r1] Index: llvm/test/CodeGen/ARM/vuzp.ll =================================================================== --- llvm/test/CodeGen/ARM/vuzp.ll +++ llvm/test/CodeGen/ARM/vuzp.ll @@ -522,12 +522,11 @@ define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 { ; CHECK-LABEL: vuzp_extract_subvector: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d17, r2, r3 -; CHECK-NEXT: vmov d16, r0, r1 -; CHECK-NEXT: vorr d18, d17, d17 -; CHECK-NEXT: vuzp.8 d16, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d18 +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vuzp.8 d17, d16 +; CHECK-NEXT: vmov r0, r1, d17 +; CHECK-NEXT: vmov r2, r3, d16 ; CHECK-NEXT: mov pc, lr %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> Index: llvm/test/CodeGen/X86/sse41.ll =================================================================== --- llvm/test/CodeGen/X86/sse41.ll +++ llvm/test/CodeGen/X86/sse41.ll @@ -97,9 +97,8 @@ ; X86-AVX512: ## %bb.0: ## %entry ; X86-AVX512-NEXT: movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4 -; X86-AVX512-NEXT: vpbroadcastw (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00] -; X86-AVX512-NEXT: vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0] -; X86-AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX512-NEXT: vpmovzxbq (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00] +; X86-AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: pmovzxbq_1: @@ -122,9 +121,8 @@ ; X64-AVX512: ## %bb.0: ## %entry ; X64-AVX512-NEXT: movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load -; X64-AVX512-NEXT: vpbroadcastw (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00] -; X64-AVX512-NEXT: vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0] -; X64-AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX512-NEXT: vpmovzxbq (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00] +; X64-AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; X64-AVX512-NEXT: retq ## encoding: [0xc3] entry: %0 = load i16, i16* @g16, align 2 ; [#uses=1]