diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18396,6 +18396,15 @@ Value.getValueType().isInteger() && (!isa<ConstantSDNode>(Value) || !cast<ConstantSDNode>(Value)->isOpaque())) { + // Convert a truncating store of an extension into a standard store. + if ((Value.getOpcode() == ISD::ZERO_EXTEND || + Value.getOpcode() == ISD::SIGN_EXTEND || + Value.getOpcode() == ISD::ANY_EXTEND) && + Value.getOperand(0).getValueType() == ST->getMemoryVT() && + TLI.isOperationLegalOrCustom(ISD::STORE, ST->getMemoryVT())) + return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, + ST->getMemOperand()); + APInt TruncDemandedBits = APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), ST->getMemoryVT().getScalarSizeInBits()); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48170,7 +48170,8 @@ St->getValue().getOperand(0).getValueType() == MVT::v16i16 && TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) && St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) { - SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); + SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, + St->getValue().getOperand(0)); return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), MVT::v16i8, St->getMemOperand()); } diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -543,17 +543,16 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 
offset:2 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:1 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 +; VI-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:1 ; VI-NEXT: s_endpgm %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1