Index: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h
@@ -401,8 +401,9 @@
   /// efficiently, casting the load to a smaller vector of larger types and
   /// loading is more efficient, however, this can be undone by optimizations in
   /// dag combiner.
-  virtual bool isLoadBitCastBeneficial(EVT LoadVT,
-                                       EVT BitcastVT) const {
+  virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                       const SelectionDAG &DAG,
+                                       const MachineMemOperand &MMO) const {
     // Don't do if we could do an indexed load on the original type, but not on
     // the new one.
     if (!LoadVT.isSimple() || !BitcastVT.isSimple())
@@ -416,14 +417,18 @@
         getTypeToPromoteTo(ISD::LOAD, LoadMVT) == BitcastVT.getSimpleVT())
       return false;

-    return true;
+    bool Fast = false;
+    return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT,
+                              MMO, &Fast) && Fast;
   }

   /// Return true if the following transform is beneficial:
   /// (store (y (conv x)), y*)) -> (store x, (x*))
-  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const {
+  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT,
+                                        const SelectionDAG &DAG,
+                                        const MachineMemOperand &MMO) const {
     // Default to the same logic as loads.
-    return isLoadBitCastBeneficial(StoreVT, BitcastVT);
+    return isLoadBitCastBeneficial(StoreVT, BitcastVT, DAG, MMO);
   }

   /// Return true if it is expected to be cheaper to do a store of a non-zero
Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11040,14 +11040,11 @@
       // as we assume software couldn't rely on the number of accesses of an
       // illegal type.
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isOperationLegal(ISD::LOAD, VT)) &&
-      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
+       TLI.isOperationLegal(ISD::LOAD, VT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-    bool Fast = false;
-    if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
-                               *LN0->getMemOperand(), &Fast) &&
-        Fast) {
+    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+                                    *LN0->getMemOperand())) {
       SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(),
                                  LN0->getBasePtr(), LN0->getPointerInfo(),
                                  LN0->getAlignment(),
@@ -16174,15 +16171,11 @@
       // illegal type.
     if (((!LegalOperations && !ST->isVolatile()) ||
          TLI.isOperationLegal(ISD::STORE, SVT)) &&
-        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
-      bool Fast = false;
-      if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
-                                 *ST->getMemOperand(), &Fast) &&
-          Fast) {
-        return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
-                            ST->getPointerInfo(), ST->getAlignment(),
-                            ST->getMemOperand()->getFlags(), ST->getAAInfo());
-      }
+        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
+                                     DAG, *ST->getMemOperand())) {
+      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+                          ST->getPointerInfo(), ST->getAlignment(),
+                          ST->getMemOperand()->getFlags(), ST->getAAInfo());
     }
   }
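[Note, not part of the diff: with this change the DAG combiner delegates both the profitability decision and the fast-unaligned-access check to a single target hook. The sketch below shows how an out-of-tree target might override the widened signature; the class name MyTargetLowering and its early-exit policy are hypothetical, while the base-class call and allowsMemoryAccess() match the code above.]

// Sketch only: hypothetical override of the new hook.  It applies one
// target-specific early exit and otherwise falls back to the default, which
// now ends with "allowsMemoryAccess(Ctx, DL, BitcastVT, MMO, &Fast) && Fast".
bool MyTargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                                               const SelectionDAG &DAG,
                                               const MachineMemOperand &MMO) const {
  // Example policy (invented): never turn a scalar load into a vector load.
  if (!LoadVT.isVector() && BitcastVT.isVector())
    return false;
  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}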
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -182,7 +182,8 @@
                              ISD::LoadExtType ExtType,
                              EVT ExtVT) const override;

-  bool isLoadBitCastBeneficial(EVT, EVT) const final;
+  bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
+                               const MachineMemOperand &MMO) const final;

   bool storeOfVectorConstantIsCheap(EVT MemVT,
                                     unsigned NumElem,
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -719,8 +719,9 @@
   return (OldSize < 32);
 }

-bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
-                                                   EVT CastTy) const {
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
+                                                   const SelectionDAG &DAG,
+                                                   const MachineMemOperand &MMO) const {
   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
@@ -730,8 +731,12 @@
   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
   unsigned CastScalarSize = CastTy.getScalarSizeInBits();

-  return (LScalarSize < CastScalarSize) ||
-         (CastScalarSize >= 32);
+  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
+    return false;
+
+  bool Fast = false;
+  return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
+                            MMO, &Fast) && Fast;
 }

 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
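[Note, illustration only, not from the patch: the AMDGPU override above rejects any cast whose result scalars are narrower than 32 bits unless the cast actually widens the elements, and otherwise lets allowsMemoryAccess() decide. A standalone restatement of that predicate, with invented names, might look like:]

// Hypothetical helper mirroring the AMDGPU logic above; CastAccessIsFast
// stands in for the "allowsMemoryAccess(..., &Fast) && Fast" query.
static bool isAMDGPULoadBitcastProfitable(unsigned LoadScalarBits,
                                          unsigned CastScalarBits,
                                          bool CastAccessIsFast) {
  // e.g. casting a <4 x i32> load to <8 x i16> (32 -> 16) is rejected here.
  if (LoadScalarBits >= CastScalarBits && CastScalarBits < 32)
    return false;
  return CastAccessIsFast;
}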
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -1127,7 +1127,9 @@
       return NumElem > 2;
     }

-    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;
+    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                 const SelectionDAG &DAG,
+                                 const MachineMemOperand &MMO) const override;

     /// Intel processors have a unified instruction and data cache
     const char * getClearCacheBuiltinName() const override {
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -4941,8 +4941,9 @@
   return Subtarget.hasLZCNT();
 }

-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
-                                                EVT BitcastVT) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                                const SelectionDAG &DAG,
+                                                const MachineMemOperand &MMO) const {
   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
       BitcastVT.getVectorElementType() == MVT::i1)
     return false;
@@ -4950,7 +4951,12 @@
   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
     return false;

-  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
+  // If both types are legal vectors, it's always ok to convert them.
+  if (LoadVT.isVector() && BitcastVT.isVector() &&
+      isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+    return true;
+
+  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
 }

 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
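[Note, illustration only, not from the patch: the new X86 fast path treats a bitcast between two already-legal vector types as free and skips the memory-access query entirely; only the remaining cases fall through to the common TargetLowering logic. A small helper with an invented name expressing just that early-out:]

// Sketch with a hypothetical name; isTypeLegal() is the real
// TargetLoweringBase query used in the override above.
static bool isFreeLegalVectorBitcast(const TargetLowering &TLI, EVT LoadVT,
                                     EVT BitcastVT) {
  // e.g. v4i32 <-> v2i64 on any SSE2-capable subtarget.
  return LoadVT.isVector() && BitcastVT.isVector() &&
         TLI.isTypeLegal(LoadVT) && TLI.isTypeLegal(BitcastVT);
}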
Index: llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -306,27 +306,25 @@
 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, (%eax)
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm2, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 ; X86-SSE2-NEXT: movd %xmm2, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
 ; X86-SSE2-NEXT: movd %xmm1, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT: movd %xmm1, %ecx
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
 ; X86-SSE2-NEXT: retl
 ;
@@ -421,27 +419,25 @@
 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, (%eax)
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm2, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 ; X86-SSE2-NEXT: movd %xmm2, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
 ; X86-SSE2-NEXT: movd %xmm1, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT: movd %xmm1, %ecx
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
 ; X86-SSE2-NEXT: retl
 ;
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL

 define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
 ; SSE-LABEL: shuffle_v4i32_0001: