Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13318,6 +13318,11 @@
   if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
     return SDValue();
 
+  // Check the operation on the result VT as well (a target may support the
+  // operation for some FP widths but not others).
+  if (!TLI.isOperationLegalOrCustom(Opcode, VT))
+    return SDValue();
+
   // Just because the floating-point vector type is legal does not necessarily
   // mean that the corresponding integer vector type is.
   if (!isTypeLegal(NVT))
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -449,6 +449,16 @@
   setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::ROTL);
 
+  // Scalarize v2f32 early, to avoid later expansion into four operations (see
+  // the comment in PerformDAGCombine).
+  SmallVector<unsigned, 9> FP32Ops =
+      {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV, ISD::FREM, ISD::SINT_TO_FP,
+       ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT};
+  for (auto Op : FP32Ops) {
+    setTargetDAGCombine(Op);
+    setOperationAction(Op, MVT::v2f32, Expand);
+  }
+
   // Handle intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -5184,7 +5194,21 @@
 SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   switch(N->getOpcode()) {
-  default: break;
+  default:
+    // Z13 can handle fp32 vectors in registers and memory, but does not
+    // support any vector operations on them. v2f32 is widened to v4f32 and
+    // kept in a single vector register, which is better when only memory
+    // operations are involved. Any operations on v2f32 should be scalarized
+    // before type legalization, or else all four operations will actually be
+    // emitted.
+    if (N->getValueType(0) == MVT::v2f32 ||
+        ((N->getOpcode() == ISD::FP_TO_SINT ||
+          N->getOpcode() == ISD::FP_TO_UINT) &&
+         N->getOperand(0).getValueType() == MVT::v2f32 &&
+         N->getValueType(0) == MVT::v2i32))
+      return DCI.DAG.UnrollVectorOp(N, 2);
+
+    break;
   case ISD::SIGN_EXTEND: return combineSIGN_EXTEND(N, DCI);
   case SystemZISD::MERGE_HIGH:
   case SystemZISD::MERGE_LOW: return combineMERGE(N, DCI);
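Note (not part of the patch): SelectionDAG::UnrollVectorOp scalarizes a vector
node lane by lane, which is what the combine above relies on. The sketch below
is a simplified illustration of the expansion it performs for a two-lane
binary node; scalarizeV2BinOp is a hypothetical helper written only to show
the shape of the result, not code from this patch.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Illustration only: roughly what DCI.DAG.UnrollVectorOp(N, 2) builds for a
  // two-lane binary node such as (fadd v2f32 %a, %b) -- extract each lane,
  // redo the operation on the scalar type, and reassemble the two results.
  static SDValue scalarizeV2BinOp(SelectionDAG &DAG, SDNode *N) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);            // v2f32
    EVT EltVT = VT.getVectorElementType();  // f32
    EVT IdxVT =
        DAG.getTargetLoweringInfo().getVectorIdxTy(DAG.getDataLayout());
    SmallVector<SDValue, 2> Elts;
    for (unsigned I = 0; I != 2; ++I) {
      SDValue Idx = DAG.getConstant(I, DL, IdxVT);
      SDValue LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                N->getOperand(0), Idx);
      SDValue RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                N->getOperand(1), Idx);
      Elts.push_back(DAG.getNode(N->getOpcode(), DL, EltVT, LHS, RHS));
    }
    // The two scalar operations are what the tests below match.
    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
  }

Doing this before type legalization matters: once v2f32 has been widened to
v4f32, the same unrolling would produce four scalar operations instead of two.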
Index: test/CodeGen/SystemZ/fp32-vec-conv.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/fp32-vec-conv.ll
@@ -0,0 +1,40 @@
+; Test that conversions of a vector of two floats generate only two
+; instructions (and not four).
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+define <2 x float> @fun0(<2 x i32> %val1) {
+; CHECK-LABEL: fun0:
+; CHECK: celfbr
+; CHECK: celfbr
+; CHECK-NOT: celfbr
+  %z = uitofp <2 x i32> %val1 to <2 x float>
+  ret <2 x float> %z
+}
+
+define <2 x float> @fun1(<2 x i32> %val1) {
+; CHECK-LABEL: fun1:
+; CHECK: cefbr
+; CHECK: cefbr
+; CHECK-NOT: cefbr
+  %z = sitofp <2 x i32> %val1 to <2 x float>
+  ret <2 x float> %z
+}
+
+define <2 x i32> @fun2(<2 x float> %val1) {
+; CHECK-LABEL: fun2:
+; CHECK: cfebr
+; CHECK: cfebr
+; CHECK-NOT: cfebr
+  %z = fptosi <2 x float> %val1 to <2 x i32>
+  ret <2 x i32> %z
+}
+
+define <2 x i32> @fun3(<2 x float> %val1) {
+; CHECK-LABEL: fun3:
+; CHECK: clfebr
+; CHECK: clfebr
+; CHECK-NOT: clfebr
+  %z = fptoui <2 x float> %val1 to <2 x i32>
+  ret <2 x i32> %z
+}
Index: test/CodeGen/SystemZ/fp32-vec-ops.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/fp32-vec-ops.ll
@@ -0,0 +1,49 @@
+; Test that operations on a vector of two floats generate only two
+; instructions (and not four).
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+define <2 x float> @fun0(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun0:
+; CHECK: aebr
+; CHECK: aebr
+; CHECK-NOT: aebr
+  %ret = fadd <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
+
+define <2 x float> @fun1(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun1:
+; CHECK: sebr
+; CHECK: sebr
+; CHECK-NOT: sebr
+  %ret = fsub <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
+
+define <2 x float> @fun2(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun2:
+; CHECK: meebr
+; CHECK: meebr
+; CHECK-NOT: meebr
+  %ret = fmul <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
+
+define <2 x float> @fun3(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun3:
+; CHECK: debr
+; CHECK: debr
+; CHECK-NOT: debr
+  %ret = fdiv <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
+
+define <2 x float> @fun4(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun4:
+; CHECK: brasl %r14, fmodf@PLT
+; CHECK: brasl %r14, fmodf@PLT
+; CHECK-NOT: brasl %r14, fmodf@PLT
+  %ret = frem <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
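A note on fun4 above: SystemZ has no f32 remainder instruction, so each
scalarized frem lane is lowered to an fmodf libcall, which is why the test
checks for exactly two brasl ... fmodf@PLT call sites. A rough C++ equivalent
of the expected per-lane expansion (illustration only; frem_v2f32 is not part
of the patch):

  #include <cmath>

  // Each lane of the <2 x float> frem turns into one fmodf libcall.
  void frem_v2f32(const float A[2], const float B[2], float Out[2]) {
    Out[0] = std::fmod(A[0], B[0]); // lane 0 -> first fmodf call
    Out[1] = std::fmod(A[1], B[1]); // lane 1 -> second fmodf call
  }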
Index: test/CodeGen/X86/cvtv2f32.ll
===================================================================
--- test/CodeGen/X86/cvtv2f32.ll
+++ test/CodeGen/X86/cvtv2f32.ll
@@ -8,26 +8,27 @@
 define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
 ; X32-LABEL: uitofp_2i32_buildvector:
 ; X32: # BB#0:
-; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; X32-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; X32-NEXT: psrld $16, %xmm1
-; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; X32-NEXT: addps {{\.LCPI.*}}, %xmm1
-; X32-NEXT: addps %xmm2, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT: orpd %xmm2, %xmm1
+; X32-NEXT: subsd %xmm2, %xmm1
+; X32-NEXT: cvtsd2ss %xmm1, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: orpd %xmm2, %xmm3
+; X32-NEXT: subsd %xmm2, %xmm3
+; X32-NEXT: xorps %xmm2, %xmm2
+; X32-NEXT: cvtsd2ss %xmm3, %xmm2
+; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; X32-NEXT: mulps %xmm1, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: uitofp_2i32_buildvector:
 ; X64: # BB#0:
-; X64-NEXT: movd %edi, %xmm1
-; X64-NEXT: pinsrd $1, %esi, %xmm1
-; X64-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; X64-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; X64-NEXT: psrld $16, %xmm1
-; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; X64-NEXT: addps {{.*}}(%rip), %xmm1
-; X64-NEXT: addps %xmm2, %xmm1
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: cvtsi2ssq %rax, %xmm1
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: cvtsi2ssq %rax, %xmm2
+; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; X64-NEXT: mulps %xmm1, %xmm0
 ; X64-NEXT: retq
 %t1 = uitofp i32 %x to float
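The updated X64 checks reflect the scalarized lowering of uitofp i32 to float:
each lane is zero-extended to 64 bits and converted with the signed convert
cvtsi2ssq, which is exact because every u32 value is representable as a
non-negative i64. A standalone C++ illustration (U32ToF32 is hypothetical,
not from the patch):

  #include <cstdint>

  // movl %edi, %eax implicitly zero-extends into %rax; cvtsi2ssq then does a
  // signed 64-bit -> float convert, correct for all uint32_t values.
  float U32ToF32(uint32_t X) {
    return static_cast<float>(static_cast<int64_t>(X));
  }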
Index: test_CodeGen_ARM_vdup.diff
===================================================================
--- /dev/null
+++ test_CodeGen_ARM_vdup.diff
@@ -0,0 +1,98 @@
+--- vdup.trunk.s	2017-02-23 14:41:45.000000000 +0200
++++ vdup.patch.s	2017-02-23 14:41:27.000000000 +0200
+@@ -588,76 +588,77 @@
+ 	vmov	r0, r1, d16
+ 	mov	pc, lr
+ .Lfunc_end35:
+ 	.size	check_i8, .Lfunc_end35-check_i8
+ 	.cantunwind
+ 	.fnend
+
+ 	.globl	check_spr_splat2
+ 	.p2align	2
+ 	.type	check_spr_splat2,%function
+ 	.code	32 @ @check_spr_splat2
+ check_spr_splat2:
+ 	.fnstart
+ @ BB#0:
+ 	lsl	r2, r2, #16
+-	vmov	d17, r0, r1
++	vmov	d16, r0, r1
+ 	asr	r2, r2, #16
+-	vdup.32	d16, r2
+-	vcvt.f32.s32	d16, d16
+-	vsub.f32	d16, d16, d17
++	vmov	s0, r2
++	vcvt.f32.s32	s0, s0
++	vdup.32	d17, d0[0]
++	vsub.f32	d16, d17, d16
+ 	vmov	r0, r1, d16
+ 	mov	pc, lr
+ .Lfunc_end36:
+ 	.size	check_spr_splat2, .Lfunc_end36-check_spr_splat2
+ 	.fnend
+
+ 	.globl	check_spr_splat4
+ 	.p2align	2
+ 	.type	check_spr_splat4,%function
+ 	.code	32 @ @check_spr_splat4
+ check_spr_splat4:
+ 	.fnstart
+ @ BB#0:
+-	mov	r12, sp
+-	vmov	d19, r2, r3
+-	vld1.16	{d16[]}, [r12:16]
+-	vmov	d18, r0, r1
+-	vmovl.s16	q8, d16
+-	vcvt.f32.s32	q8, q8
+-	vsub.f32	q8, q8, q9
++	ldrsh	r12, [sp]
++	vmov	d17, r2, r3
++	vmov	d16, r0, r1
++	vmov	s0, r12
++	vcvt.f32.s32	s0, s0
++	vdup.32	q9, d0[0]
++	vsub.f32	q8, q9, q8
+ 	vmov	r0, r1, d16
+ 	vmov	r2, r3, d17
+ 	mov	pc, lr
+ .Lfunc_end37:
+ 	.size	check_spr_splat4, .Lfunc_end37-check_spr_splat4
+ 	.fnend
+
+ 	.globl	check_spr_splat4_lane1
+ 	.p2align	2
+ 	.type	check_spr_splat4_lane1,%function
+ 	.code	32 @ @check_spr_splat4_lane1
+ check_spr_splat4_lane1:
+ 	.fnstart
+ @ BB#0:
+-	mov	r12, sp
+-	vmov	d19, r2, r3
+-	vld1.16	{d16[]}, [r12:16]
+-	vmov	d18, r0, r1
+-	vmovl.s16	q8, d16
+-	vcvt.f32.s32	q8, q8
+-	vsub.f32	q8, q8, q9
++	ldrsh	r12, [sp]
++	vmov	d17, r2, r3
++	vmov	d16, r0, r1
++	vmov	s0, r12
++	vcvt.f32.s32	s0, s0
++	vdup.32	q9, d0[0]
++	vsub.f32	q8, q9, q8
+ 	vmov	r0, r1, d16
+ 	vmov	r2, r3, d17
+ 	mov	pc, lr
+ .Lfunc_end38:
+ 	.size	check_spr_splat4_lane1, .Lfunc_end38-check_spr_splat4_lane1
+ 	.fnend
+
+ 	.globl	check_i8_varidx
+ 	.p2align	2
+ 	.type	check_i8_varidx,%function
+ 	.code	32 @ @check_i8_varidx
+ check_i8_varidx:
+ 	.fnstart
+ @ BB#0:
+ 	.save	{r11}
Index: test_CodeGen_X86_MLICMbug.diff
===================================================================
--- /dev/null
+++ test_CodeGen_X86_MLICMbug.diff
@@ -0,0 +1,73 @@
+--- 2009-02-26-MachineLICMBug.trunk.s	2017-02-23 14:56:54.000000000 +0200
++++ 2009-02-26-MachineLICMBug.patch.s	2017-02-23 14:56:45.000000000 +0200
+@@ -1,59 +1,43 @@
+ 	.section	__TEXT,__text,regular,pure_instructions
+ 	.macosx_version_min 10, 6
+-	.section	__TEXT,__literal16,16byte_literals
+-	.p2align	4
+-LCPI0_0:
+-	.long	1258291200 ## 0x4b000000
+-	.long	1258291200 ## 0x4b000000
+-	.long	1258291200 ## 0x4b000000
+-	.long	1258291200 ## 0x4b000000
+-LCPI0_1:
+-	.long	1392508928 ## 0x53000000
+-	.long	1392508928 ## 0x53000000
+-	.long	1392508928 ## 0x53000000
+-	.long	1392508928 ## 0x53000000
+-LCPI0_2:
+-	.long	3539992704 ## float -5.49764202E+11
+-	.long	3539992704 ## float -5.49764202E+11
+-	.long	3539992704 ## float -5.49764202E+11
+-	.long	3539992704 ## float -5.49764202E+11
+-	.section	__TEXT,__text,regular,pure_instructions
+ 	.globl	_t
+ 	.p2align	4, 0x90
+ _t: ## @t
+ ## BB#0: ## %entry
+ 	pushq	%r14
+ 	pushq	%rbx
+ 	pushq	%rax
+ 	movq	%rsi, %r14
+ 	movq	%rdi, %rbx
+ 	orq	$2097152, %r14 ## imm = 0x200000
+ 	andl	$15728640, %r14d ## imm = 0xF00000
+ 	jmp	LBB0_1
+ 	.p2align	4, 0x90
+ LBB0_3: ## %bb.i
+ ## in Loop: Header=BB0_1 Depth=1
+-	movd	0, %xmm0 ## xmm0 = mem[0],zero,zero,zero
+-	pinsrd	$1, 4, %xmm0
+-	pinsrd	$2, 8, %xmm0
+-	movdqa	%xmm0, %xmm1
+-	pblendw	$170, LCPI0_0(%rip), %xmm1 ## xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
+-	psrld	$16, %xmm0
+-	pblendw	$170, LCPI0_1(%rip), %xmm0 ## xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+-	addps	LCPI0_2(%rip), %xmm0
+-	addps	%xmm1, %xmm0
++	movl	0, %eax
++	movl	4, %ecx
++	xorps	%xmm0, %xmm0
++	cvtsi2ssq	%rax, %xmm0
++	xorps	%xmm1, %xmm1
++	cvtsi2ssq	%rcx, %xmm1
++	movl	8, %eax
++	xorps	%xmm2, %xmm2
++	cvtsi2ssq	%rax, %xmm2
++	insertps	$16, %xmm1, %xmm0 ## xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
++	insertps	$32, %xmm2, %xmm0 ## xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+ 	movaps	%xmm0, 0
+ LBB0_1: ## %bb4
+ ## =>This Inner Loop Header: Depth=1
+ 	xorl	%eax, %eax
+ 	callq	_xxGetOffsetForCode
+ 	xorl	%esi, %esi
+ 	xorl	%eax, %eax
+ 	movq	%rbx, %rdi
+ 	callq	_xxCalculateMidType
+ 	cmpl	$1, %eax
+ 	jne	LBB0_1
+ ## BB#2: ## %bb26
+ ## in Loop: Header=BB0_1 Depth=1
+ 	cmpq	$1048576, %r14 ## imm = 0x100000
+ 	jne	LBB0_1