diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -48,13 +48,15 @@
                      VEX, T8XD;
 
   // Pseduo instruction for RA.
-  def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+  let isReMaterializable = 1, canFoldAsLoad = 1 in
+  def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
                                                 GR16:$src2,
                                                 opaquemem:$src3), []>;
   def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
                                           GR16:$src2, opaquemem:$src3,
                                           TILE:$src4), []>;
-  def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+  let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+  def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
                                                GR16:$src2), []>;
 
   let usesCustomInserter = 1 in {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1006,6 +1006,8 @@
   case X86::MOV64ri:
   case X86::MOV64ri32:
   case X86::MOV8ri:
+  case X86::PTILEZEROV:
+  case X86::PTILELOADDV:
    return true;
 
  case X86::MOV8rm:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -131,13 +131,10 @@
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB2_2: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    tileloadd (%r15,%r12), %tmm0
-; CHECK-NEXT:    movabsq $64, %rax
-; CHECK-NEXT:    tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
 ; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movabsq $64, %rax
-; CHECK-NEXT:    tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
+; CHECK-NEXT:    tileloadd (%r15,%r12), %tmm0
 ; CHECK-NEXT:    tilestored %tmm0, (%r13,%r12)
 ; CHECK-NEXT:    callq foo
 ; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
--- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
@@ -95,7 +95,7 @@
 ; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    subq $72, %rsp
 ; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
@@ -105,11 +105,9 @@
 ; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movw $8, %r14w
-; CHECK-NEXT:    tilezero %tmm3
+; CHECK-NEXT:    tilezero %tmm0
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    jne .LBB1_3
@@ -120,28 +118,20 @@
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_2: # %loop.header
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movabsq $64, %rax
-; CHECK-NEXT:    tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    tilezero %tmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
 ; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movabsq $64, %rax
-; CHECK-NEXT:    tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT:    tilezero %tmm2
 ; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm0
 ; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm1
-; CHECK-NEXT:    # implicit-def: $rax
-; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movabsq $64, %rax
-; CHECK-NEXT:    tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
-; CHECK-NEXT:    tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
 ; CHECK-NEXT:    tilestored %tmm2, (%rbx,%r15)
 ; CHECK-NEXT:    incl %ebp
 ; CHECK-NEXT:    cmpw $100, %bp
 ; CHECK-NEXT:    jl .LBB1_2
 ; CHECK-NEXT:  .LBB1_3: # %exit
-; CHECK-NEXT:    addq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    popq %r15
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -120,7 +120,7 @@
 ; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $3032, %rsp # imm = 0xBD8
+; CHECK-NEXT:    subq $72, %rsp
 ; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
@@ -143,10 +143,9 @@
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_2: # %loop.header
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movabsq $64, %rax
-; CHECK-NEXT:    tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
 ; CHECK-NEXT:    tilestored %tmm0, (%rbx,%r14)
 ; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    tilezero %tmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
 ; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
@@ -155,13 +154,12 @@
 ; CHECK-NEXT:    tileloadd (%rbx,%r14), %tmm2
 ; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:    tilestored %tmm0, (%rbx,%r14)
-; CHECK-NEXT:    movabsq $64, %rax
-; CHECK-NEXT:    tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
+; CHECK-NEXT:    tilezero %tmm0
 ; CHECK-NEXT:    incl %ebp
 ; CHECK-NEXT:    cmpw $100, %bp
 ; CHECK-NEXT:    jl .LBB1_2
 ; CHECK-NEXT:  .LBB1_3: # %exit
-; CHECK-NEXT:    addq $3032, %rsp # imm = 0xBD8
+; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    popq %r15