diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1501,6 +1501,14 @@ EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized()) { + SDValue Value, NewChain; + std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG); + std::tie(Lo, Hi) = DAG.SplitVector(Value, dl); + ReplaceValueWith(SDValue(LD, 1), NewChain); + return; + } + Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset, LD->getPointerInfo(), LoMemVT, Alignment, MMOFlags, AAInfo); @@ -3664,6 +3672,56 @@ LoadSDNode *LD = cast<LoadSDNode>(N); ISD::LoadExtType ExtType = LD->getExtensionType(); + // A vector must always be stored in memory as-is, i.e. without any padding + // between the elements, since various code depend on it, e.g. in the + // handling of a bitcast of a vector type to int, which may be done with a + // vector store followed by an integer load. A vector that does not have + // elements that are byte-sized must therefore be stored as an integer + // built out of the extracted vector elements. 
+ if (!LD->getMemoryVT().isByteSized()) { + SDLoc SL(LD); + EVT SrcVT = LD->getMemoryVT(); + EVT DstVT = LD->getValueType(0); + ISD::LoadExtType ExtType = LD->getExtensionType(); + + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), DstVT); + + unsigned NumElem = SrcVT.getVectorNumElements(); + unsigned WidenNumElem = WidenVT.getVectorNumElements(); + + EVT SrcEltVT = SrcVT.getScalarType(); + EVT DstEltVT = DstVT.getScalarType(); + + unsigned NumBits = SrcVT.getSizeInBits(); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumBits); + + SDValue Load = DAG.getLoad(IntVT, SL, LD->getChain(), LD->getBasePtr(), + LD->getPointerInfo(), + LD->getAlignment(), + LD->getMemOperand()->getFlags(), + LD->getAAInfo()); + + SmallVector<SDValue, 16> Vals(WidenNumElem, DAG.getUNDEF(DstEltVT)); + for (unsigned Idx = 0; Idx < NumElem; ++Idx) { + unsigned ShiftIntoIdx = + (DAG.getDataLayout().isBigEndian() ? (NumElem - 1) - Idx : Idx); + SDValue ShiftAmount = + DAG.getConstant(ShiftIntoIdx * SrcEltVT.getSizeInBits(), SL, IntVT); + SDValue ShiftedElt = + DAG.getNode(ISD::SRL, SL, IntVT, Load, ShiftAmount); + SDValue Scalar = DAG.getNode(ISD::TRUNCATE, SL, SrcEltVT, ShiftedElt); + if (ExtType != ISD::NON_EXTLOAD) { + unsigned ExtendOp = ISD::getExtForLoadExtType(false, ExtType); + Scalar = DAG.getNode(ExtendOp, SL, DstEltVT, Scalar); + } + Vals[Idx] = Scalar; + } + + SDValue Value = DAG.getBuildVector(WidenVT, SL, Vals); + ReplaceValueWith(SDValue(LD, 1), Load.getValue(1)); + return Value; + } + SDValue Result; SmallVector<SDValue, 16> LdChain; // Chain for the series of load if (ExtType != ISD::NON_EXTLOAD) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6578,12 +6578,48 @@ SDValue Chain = LD->getChain(); SDValue BasePTR = LD->getBasePtr(); EVT SrcVT = LD->getMemoryVT(); + EVT DstVT = LD->getValueType(0); ISD::LoadExtType 
ExtType = LD->getExtensionType(); unsigned NumElem = SrcVT.getVectorNumElements(); EVT SrcEltVT = SrcVT.getScalarType(); - EVT DstEltVT = LD->getValueType(0).getScalarType(); + EVT DstEltVT = DstVT.getScalarType(); + + // A vector must always be stored in memory as-is, i.e. without any padding + // between the elements, since various code depend on it, e.g. in the + // handling of a bitcast of a vector type to int, which may be done with a + // vector store followed by an integer load. A vector that does not have + // elements that are byte-sized must therefore be stored as an integer + // built out of the extracted vector elements. + if (!SrcEltVT.isByteSized()) { + unsigned NumBits = SrcVT.getSizeInBits(); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumBits); + + SDValue Load = DAG.getLoad(IntVT, SL, Chain, BasePTR, LD->getPointerInfo(), + LD->getAlignment(), + LD->getMemOperand()->getFlags(), + LD->getAAInfo()); + + SmallVector<SDValue, 8> Vals; + for (unsigned Idx = 0; Idx < NumElem; ++Idx) { + unsigned ShiftIntoIdx = + (DAG.getDataLayout().isBigEndian() ? 
(NumElem - 1) - Idx : Idx); + SDValue ShiftAmount = + DAG.getConstant(ShiftIntoIdx * SrcEltVT.getSizeInBits(), SL, IntVT); + SDValue ShiftedElt = + DAG.getNode(ISD::SRL, SL, IntVT, Load, ShiftAmount); + SDValue Scalar = DAG.getNode(ISD::TRUNCATE, SL, SrcEltVT, ShiftedElt); + if (ExtType != ISD::NON_EXTLOAD) { + unsigned ExtendOp = ISD::getExtForLoadExtType(false, ExtType); + Scalar = DAG.getNode(ExtendOp, SL, DstEltVT, Scalar); + } + Vals.push_back(Scalar); + } + + SDValue Value = DAG.getBuildVector(DstVT, SL, Vals); + return std::make_pair(Value, Load.getValue(1)); + } unsigned Stride = SrcEltVT.getSizeInBits() / 8; assert(SrcEltVT.isByteSized()); @@ -6605,7 +6641,7 @@ } SDValue NewChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoadChains); - SDValue Value = DAG.getBuildVector(LD->getValueType(0), SL, Vals); + SDValue Value = DAG.getBuildVector(DstVT, SL, Vals); return std::make_pair(Value, NewChain); } diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll --- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll +++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll @@ -121,15 +121,19 @@ define void @fun3(<3 x i31>* %src, <3 x i31>* %p) ; CHECK-LABEL: fun3: ; CHECK: # %bb.0: -; CHECK-NEXT: llgf %r0, 3(%r2) -; CHECK-NEXT: llgf %r1, 6(%r2) -; CHECK-NEXT: llgf %r2, 0(%r2) -; CHECK-NEXT: rosbg %r1, %r0, 0, 32, 31 -; CHECK-NEXT: sllg %r4, %r2, 62 -; CHECK-NEXT: rosbg %r4, %r0, 0, 32, 31 -; CHECK-NEXT: srlg %r0, %r4, 32 +; CHECK-NEXT: llgf %r0, 8(%r2) +; CHECK-NEXT: lg %r1, 0(%r2) +; CHECK-NEXT: sllg %r2, %r1, 32 +; CHECK-NEXT: lr %r2, %r0 +; CHECK-NEXT: srlg %r4, %r2, 62 +; CHECK-NEXT: rosbg %r4, %r1, 33, 61, 34 +; CHECK-NEXT: risbgn %r1, %r2, 2, 160, 0 +; CHECK-NEXT: rosbg %r1, %r0, 33, 63, 0 +; CHECK-NEXT: sllg %r0, %r4, 62 +; CHECK-NEXT: rosbg %r0, %r2, 2, 32, 0 +; CHECK-NEXT: srlg %r0, %r0, 32 ; CHECK-NEXT: st %r1, 8(%r3) -; CHECK-NEXT: sllg %r1, %r2, 30 +; CHECK-NEXT: sllg %r1, %r4, 30 
; CHECK-NEXT: lr %r1, %r0 ; CHECK-NEXT: nihh %r1, 8191 ; CHECK-NEXT: stg %r1, 0(%r3) diff --git a/llvm/test/CodeGen/X86/load-local-v3i1.ll b/llvm/test/CodeGen/X86/load-local-v3i1.ll --- a/llvm/test/CodeGen/X86/load-local-v3i1.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i1.ll @@ -96,28 +96,22 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: movq %rdi, %r14 -; CHECK-NEXT: movzbl (%rdx), %ebp -; CHECK-NEXT: movl %ebp, %eax -; CHECK-NEXT: shrl %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: movl %ebp, %ecx -; CHECK-NEXT: andl $1, %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: pinsrd $1, %eax, %xmm0 -; CHECK-NEXT: shrl $2, %ebp -; CHECK-NEXT: andl $1, %ebp -; CHECK-NEXT: pinsrd $2, %ebp, %xmm0 -; CHECK-NEXT: movd %xmm0, %ebx -; CHECK-NEXT: pextrd $1, %xmm0, %r15d +; CHECK-NEXT: movb (%rdx), %al +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrb $2, %cl +; CHECK-NEXT: movzbl %cl, %r15d +; CHECK-NEXT: movzbl %al, %ebx +; CHECK-NEXT: shrb %al +; CHECK-NEXT: movzbl %al, %ebp ; CHECK-NEXT: movq %rsi, %rdi ; CHECK-NEXT: movl %ebx, %esi -; CHECK-NEXT: movl %r15d, %edx -; CHECK-NEXT: movl %ebp, %ecx +; CHECK-NEXT: movl %ebp, %edx +; CHECK-NEXT: movl %r15d, %ecx ; CHECK-NEXT: callq masked_load_v3 ; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: movl %ebx, %esi -; CHECK-NEXT: movl %r15d, %edx -; CHECK-NEXT: movl %ebp, %ecx +; CHECK-NEXT: movl %ebp, %edx +; CHECK-NEXT: movl %r15d, %ecx ; CHECK-NEXT: callq masked_store4_v3 ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/pr42803.ll b/llvm/test/CodeGen/X86/pr42803.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr42803.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; ModuleID = 'd' +; RUN: llc < %s -O0 -mattr=-sse | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +@0 = internal unnamed_addr constant [3 x i1] [i1 false, i1 true, i1 false], 
align 1 + +declare fastcc void @panic() + +define void @_start() { +; CHECK-LABEL: _start: +; CHECK: # %bb.0: # %Entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movb __unnamed_1+{{.*}}(%rip), %al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw {{.*}}(%rip), %cx +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %dl +; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-NEXT: addb %sil, %sil +; CHECK-NEXT: orb %sil, %dl +; CHECK-NEXT: shlb $2, %al +; CHECK-NEXT: orb %al, %dl +; CHECK-NEXT: andb $7, %dl +; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-NEXT: movb %al, %dl +; CHECK-NEXT: shrb %dl +; CHECK-NEXT: movzbl %al, %edi +; CHECK-NEXT: btl $1, %edi +; CHECK-NEXT: jb .LBB0_2 +; CHECK-NEXT: jmp .LBB0_1 +; CHECK-NEXT: .LBB0_1: # %Then +; CHECK-NEXT: callq panic +; CHECK-NEXT: .LBB0_2: # %Else +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .LBB0_3: # %EndIf +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +Entry: + %a = alloca [3 x i1], align 1 + %x2 = alloca <3 x i1>, align 4 + %0 = bitcast [3 x i1]* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 bitcast ([3 x i1]* @0 to i8*), i64 3, i1 false) + %1 = load [3 x i1], [3 x i1]* %a + %vector_to_array = extractvalue [3 x i1] %1, 0 + %2 = insertelement <3 x i1> undef, i1 %vector_to_array, i32 0 + %vector_to_array1 = extractvalue [3 x i1] %1, 1 + %3 = insertelement <3 x i1> %2, i1 %vector_to_array1, i32 1 + %vector_to_array2 = extractvalue [3 x i1] %1, 2 + %4 = insertelement <3 x i1> %3, i1 %vector_to_array2, i32 2 + store <3 x i1> %4, <3 x i1>* %x2, align 4 + %5 = load <3 x i1>, <3 x i1>* %x2, align 4 + %6 = extractelement <3 x i1> %5, i32 1 + %7 = icmp ne i1 %6, true + br i1 %7, label %Then, label %Else + +Then: 
; preds = %Entry + tail call fastcc void @panic() + unreachable + +Else: ; preds = %Entry + br label %EndIf + +EndIf: ; preds = %Else + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) diff --git a/llvm/test/CodeGen/X86/pr44902.ll b/llvm/test/CodeGen/X86/pr44902.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr44902.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s + +; ModuleID = 'foo' +source_filename = "foo" +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i386-unknown-windows-msvc" + +@0 = internal unnamed_addr constant <4 x i1> <i1 true, i1 false, i1 true, i1 false>, align 4 + +define void @fun(<4 x i1>) { +; CHECK-LABEL: fun: +; CHECK: # %bb.0: # %Entry +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK-NEXT: addb %al, %al +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: shlb $2, %cl +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK-NEXT: addb %al, %al +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %dl +; CHECK-NEXT: andb $1, %dl +; CHECK-NEXT: orb %dl, %al +; CHECK-NEXT: andb $3, %al +; CHECK-NEXT: orb %cl, %al +; CHECK-NEXT: andb $15, %al +; CHECK-NEXT: movb %al, (%esp) +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: jne LBB0_2 +; CHECK-NEXT: # %bb.1: # %Then +; CHECK-NEXT: int3 +; CHECK-NEXT: LBB0_2: # %EndIf +; CHECK-NEXT: movzbl (%esp), %eax +; CHECK-NEXT: btl $1, %eax +; CHECK-NEXT: jae LBB0_4 +; CHECK-NEXT: # %bb.3: # %Then1 +; CHECK-NEXT: int3 +; CHECK-NEXT: LBB0_4: # %EndIf3 +; CHECK-NEXT: movzbl (%esp), %eax +; CHECK-NEXT: btl $2, %eax +; CHECK-NEXT: jb LBB0_6 +; CHECK-NEXT: # %bb.5: # %Then4 +; CHECK-NEXT: int3 +; CHECK-NEXT: LBB0_6: # %EndIf6 +; CHECK-NEXT: testb $8, (%esp) +; CHECK-NEXT: je LBB0_8 +; CHECK-NEXT: # %bb.7: # %Then7 +; CHECK-NEXT: int3 +; CHECK-NEXT: LBB0_8: # %EndIf9 +; CHECK-NEXT: popl 
%eax +; CHECK-NEXT: retl +Entry: + %x = alloca <4 x i1>, align 4 + store <4 x i1> %0, <4 x i1>* %x, align 4 + %1 = load <4 x i1>, <4 x i1>* %x + %2 = extractelement <4 x i1> %1, i32 0 + %3 = icmp ne i1 %2, true + br i1 %3, label %Then, label %Else + +Then: ; preds = %Entry + call void @llvm.debugtrap() + br label %EndIf + +Else: ; preds = %Entry + br label %EndIf + +EndIf: ; preds = %Else, %Then + %4 = load <4 x i1>, <4 x i1>* %x + %5 = extractelement <4 x i1> %4, i32 1 + %6 = icmp ne i1 %5, false + br i1 %6, label %Then1, label %Else2 + +Then1: ; preds = %EndIf + call void @llvm.debugtrap() + br label %EndIf3 + +Else2: ; preds = %EndIf + br label %EndIf3 + +EndIf3: ; preds = %Else2, %Then1 + %7 = load <4 x i1>, <4 x i1>* %x + %8 = extractelement <4 x i1> %7, i32 2 + %9 = icmp ne i1 %8, true + br i1 %9, label %Then4, label %Else5 + +Then4: ; preds = %EndIf3 + call void @llvm.debugtrap() + br label %EndIf6 + +Else5: ; preds = %EndIf3 + br label %EndIf6 + +EndIf6: ; preds = %Else5, %Then4 + %10 = load <4 x i1>, <4 x i1>* %x + %11 = extractelement <4 x i1> %10, i32 3 + %12 = icmp ne i1 %11, false + br i1 %12, label %Then7, label %Else8 + +Then7: ; preds = %EndIf6 + call void @llvm.debugtrap() + br label %EndIf9 + +Else8: ; preds = %EndIf6 + br label %EndIf9 + +EndIf9: ; preds = %Else8, %Then7 + ret void +} + +; Function Attrs: nounwind +declare void @llvm.debugtrap()