diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5836,7 +5836,7 @@ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) return true; - if (ScalarTy->isHalfTy() && ST->hasBWI()) + if (ScalarTy->is16bitFPTy() && ST->hasBWI()) return true; if (!ScalarTy->isIntegerTy()) @@ -6299,7 +6299,7 @@ if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || EltTy->isIntegerTy(32) || EltTy->isPointerTy()) return true; - if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy()) + if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->is16bitFPTy()) return HasBW; return false; }; diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -581,3 +581,581 @@ ; BF16-NEXT: retq ret <32 x bfloat> zeroinitializer } + +define <32 x bfloat> @pr63017_2() nounwind { +; SSE2-LABEL: pr63017_2: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $200, %rsp +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_1 +; SSE2-NEXT: # %bb.2: # %cond.load +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: jmp .LBB12_3 +; SSE2-NEXT: .LBB12_1: +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: .LBB12_3: # %else +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_5 +; SSE2-NEXT: # %bb.4: # %cond.load1 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; 
SSE2-NEXT: .LBB12_5: # %else2 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_6 +; SSE2-NEXT: # %bb.7: # %cond.load4 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movdqa %xmm1, %xmm14 +; SSE2-NEXT: movdqa %xmm1, %xmm15 +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm1, %xmm13 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: jmp .LBB12_8 +; SSE2-NEXT: .LBB12_6: +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movdqa %xmm1, %xmm14 +; SSE2-NEXT: movdqa %xmm1, %xmm15 +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm1, %xmm13 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: .LBB12_8: # %else5 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_10 +; SSE2-NEXT: # %bb.9: # %cond.load7 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_10: # %else8 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_12 +; SSE2-NEXT: # %bb.11: # %cond.load10 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_12: # %else11 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_14 +; SSE2-NEXT: # %bb.13: # %cond.load13 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_14: # %else14 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_16 +; SSE2-NEXT: # %bb.15: # %cond.load16 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_16: # %else17 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_18 +; SSE2-NEXT: # %bb.17: # %cond.load19 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_18: # %else20 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_20 +; SSE2-NEXT: # %bb.19: # %cond.load22 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_20: # %else23 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_22 +; SSE2-NEXT: # %bb.21: # %cond.load25 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_22: # %else26 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_24 +; SSE2-NEXT: # %bb.23: # %cond.load28 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_24: # %else29 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_26 +; SSE2-NEXT: # %bb.25: # %cond.load31 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax 
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_26: # %else32 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_28 +; SSE2-NEXT: # %bb.27: # %cond.load34 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_28: # %else35 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_30 +; SSE2-NEXT: # %bb.29: # %cond.load37 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_30: # %else38 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_32 +; SSE2-NEXT: # %bb.31: # %cond.load40 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_32: # %else41 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_34 +; SSE2-NEXT: # %bb.33: # %cond.load43 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_34: # %else44 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_36 +; SSE2-NEXT: # %bb.35: # %cond.load46 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_36: # %else47 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_38 +; SSE2-NEXT: # %bb.37: # %cond.load49 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_38: # %else50 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_40 +; SSE2-NEXT: # %bb.39: # %cond.load52 +; SSE2-NEXT: movzwl (%rax), %eax +; 
SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: .LBB12_40: # %else53 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_42 +; SSE2-NEXT: # %bb.41: # %cond.load55 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: .LBB12_42: # %else56 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_44 +; SSE2-NEXT: # %bb.43: # %cond.load58 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: .LBB12_44: # %else59 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_46 +; SSE2-NEXT: # %bb.45: # %cond.load61 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm13 +; SSE2-NEXT: .LBB12_46: # %else62 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_48 +; SSE2-NEXT: # %bb.47: # %cond.load64 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: .LBB12_48: # %else65 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_50 +; SSE2-NEXT: # %bb.49: # %cond.load67 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: .LBB12_50: # %else68 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_52 +; SSE2-NEXT: # %bb.51: # %cond.load70 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: .LBB12_52: # %else71 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_54 +; SSE2-NEXT: # %bb.53: # %cond.load73 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: .LBB12_54: # %else74 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_56 +; SSE2-NEXT: # %bb.55: # 
%cond.load76 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: .LBB12_56: # %else77 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_58 +; SSE2-NEXT: # %bb.57: # %cond.load79 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: .LBB12_58: # %else80 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_60 +; SSE2-NEXT: # %bb.59: # %cond.load82 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: .LBB12_60: # %else83 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_62 +; SSE2-NEXT: # %bb.61: # %cond.load85 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: .LBB12_62: # %else86 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_64 +; SSE2-NEXT: # %bb.63: # %cond.load88 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: .LBB12_64: # %else89 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jne .LBB12_65 +; SSE2-NEXT: # %bb.66: # %cond.load91 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: jmp .LBB12_67 +; SSE2-NEXT: .LBB12_65: +; SSE2-NEXT: movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: .LBB12_67: # %else92 +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; 
SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = 
mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: 
movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = 
mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: addq $200, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: retq +; +; BF16-LABEL: 
pr63017_2: +; BF16: # %bb.0: +; BF16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] +; BF16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1} +; BF16-NEXT: retq + %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>) + ret <32 x bfloat> %1 +} + +declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)