diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -300,11 +300,13 @@
   { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
   { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
   { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+  { X86::MOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
   { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
   { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
   { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
   { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
   { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+  { X86::MOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
   { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
   { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
   { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
@@ -357,6 +359,8 @@
   { X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
   { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
   { X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE },
+  { X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+  { X86::VMOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
   { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
   { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
   { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -367,6 +371,8 @@
   { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
   { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
   { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+  { X86::VMOVDI2SSZrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+  { X86::VMOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
   { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
   { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
   { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5526,6 +5526,10 @@
   if (I != nullptr) {
     unsigned Opcode = I->DstOp;
+    bool FoldedLoad =
+        isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0;
+    bool FoldedStore =
+        isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE);
     MaybeAlign MinAlign =
         decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT);
     if (MinAlign && Alignment < *MinAlign)
@@ -5536,20 +5540,25 @@
       const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
       unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
-      if (Size < RCSize) {
-        // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
-        // Check if it's safe to fold the load. If the size of the object is
-        // narrower than the load width, then it's not.
-        if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
-          return nullptr;
+      // Check if it's safe to fold the load. If the size of the object is
+      // narrower than the load width, then it's not.
+      // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
+      if (FoldedLoad && Size < RCSize) {
         // If this is a 64-bit load, but the spill slot is 32, then we can do
         // a 32-bit load which is implicitly zero-extended. This likely is
         // due to live interval analysis remat'ing a load from stack slot.
+        if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+          return nullptr;
         if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
           return nullptr;
         Opcode = X86::MOV32rm;
         NarrowToMOV32rm = true;
       }
+      // For stores, make sure the size of the object is equal to the size of
+      // the store. If the object is larger, the extra bits would be garbage. If
+      // the object is smaller we might overwrite another object or fault.
+      if (FoldedStore && Size != RCSize)
+        return nullptr;
     }
 
     if (isTwoAddrFold)
diff --git a/llvm/test/CodeGen/X86/pr47874.ll b/llvm/test/CodeGen/X86/pr47874.ll
--- a/llvm/test/CodeGen/X86/pr47874.ll
+++ b/llvm/test/CodeGen/X86/pr47874.ll
@@ -9,8 +9,7 @@
 ; SSE2-NEXT:    testl %esi, %esi
 ; SSE2-NEXT:    jle LBB0_3
 ; SSE2-NEXT:  ## %bb.1: ## %bb2
-; SSE2-NEXT:    movd %esi, %xmm0
-; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
+; SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
 ; SSE2-NEXT:    movl %esi, %eax
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  LBB0_2: ## %bb6
@@ -31,8 +30,7 @@
 ; AVX-NEXT:    testl %esi, %esi
 ; AVX-NEXT:    jle LBB0_3
 ; AVX-NEXT:  ## %bb.1: ## %bb2
-; AVX-NEXT:    vmovd %esi, %xmm0
-; AVX-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
+; AVX-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
 ; AVX-NEXT:    movl %esi, %eax
 ; AVX-NEXT:    .p2align 4, 0x90
 ; AVX-NEXT:  LBB0_2: ## %bb6
@@ -78,8 +76,7 @@
 ; SSE2-NEXT:    testq %rsi, %rsi
 ; SSE2-NEXT:    jle LBB1_3
 ; SSE2-NEXT:  ## %bb.1: ## %bb2
-; SSE2-NEXT:    movq %rsi, %xmm0
-; SSE2-NEXT:    movq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
+; SSE2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  LBB1_2: ## %bb6
 ; SSE2-NEXT:    ## =>This Inner Loop Header: Depth=1
@@ -99,8 +96,7 @@
 ; AVX-NEXT:    testq %rsi, %rsi
 ; AVX-NEXT:    jle LBB1_3
 ; AVX-NEXT:  ## %bb.1: ## %bb2
-; AVX-NEXT:    vmovq %rsi, %xmm0
-; AVX-NEXT:    vmovq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
+; AVX-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; AVX-NEXT:    .p2align 4, 0x90
 ; AVX-NEXT:  LBB1_2: ## %bb6
 ; AVX-NEXT:    ## =>This Inner Loop Header: Depth=1
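Note (not part of the patch): the sketch below is a standalone, hypothetical restatement of the size checks the X86InstrInfo.cpp hunk introduces. The names (checkFoldSizes, FoldDecision, slotSize, regSize, isMov64Load) are illustrative only and do not exist in LLVM; slotSize stands for the stack-slot size in bytes and regSize for the byte width of the register class of the operand being folded.

// Standalone sketch (hypothetical names, not LLVM API) of the spill-slot
// size checks added above.
struct FoldDecision {
  bool Legal;          // may the memory operand be folded at all?
  bool NarrowToMov32;  // fold as an implicitly zero-extending 32-bit load
};

static FoldDecision checkFoldSizes(bool foldsLoad, bool foldsStore,
                                   bool isMov64Load, unsigned slotSize,
                                   unsigned regSize) {
  FoldDecision D = {true, false};
  if (foldsLoad && slotSize < regSize) {
    // Reading more bytes than the slot holds is only tolerated for the
    // special case of a 64-bit GPR load from a 4-byte slot, which can be
    // shrunk to a 32-bit load that zero-extends to 64 bits.
    if (!isMov64Load || regSize != 8 || slotSize != 4)
      return {false, false};
    D.NarrowToMov32 = true;
  }
  // Stores must match the slot exactly: a larger slot would keep garbage in
  // its upper bytes, a smaller one could be overwritten past its end or
  // fault.
  if (foldsStore && slotSize != regSize)
    D.Legal = false;
  return D;
}

Presumably the new fold-table entries are what let the spill of a MOVDI2SSrr/MOV64toSDrr result be folded into a plain GPR store once the store-size check is in place, which is what the updated pr47874.ll checks show: movl/movq of %esi/%rsi straight to the stack slot instead of bouncing through %xmm0.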