Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -311,6 +311,12 @@
     // Operand 5: the width of the field in bits (8 or 16)
     ATOMIC_CMP_SWAPW,
 
+    // Byte swap load
+    LRV,
+
+    // Byte swap store
+    STRV,
+
     // Prefetch from the second operand using the 4-bit control code in
     // the first operand.  The code is 1 for a load prefetch and 2 for
     // a store prefetch.
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -435,6 +435,7 @@
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::FP_ROUND);
+  setTargetDAGCombine(ISD::BSWAP);
 
   // Handle intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -4637,6 +4638,8 @@
     OPCODE(ATOMIC_LOADW_UMIN);
     OPCODE(ATOMIC_LOADW_UMAX);
     OPCODE(ATOMIC_CMP_SWAPW);
+    OPCODE(LRV);
+    OPCODE(STRV);
     OPCODE(PREFETCH);
   }
   return nullptr;
@@ -4932,6 +4935,72 @@
       }
     }
   }
+
+  // Combine BSWAP (LOAD) into LRVH/LRV/LRVG.
+  if (Opcode == ISD::BSWAP &&
+      ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
+      N->getOperand(0).hasOneUse() &&
+      (N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i32 ||
+       N->getValueType(0) == MVT::i64) &&
+      !cast<LoadSDNode>(N->getOperand(0))->isVolatile()) {
+    SDValue Load = N->getOperand(0);
+    LoadSDNode *LD = cast<LoadSDNode>(Load);
+
+    // Create the byte-swapping load.
+    SDValue Ops[] = {
+      LD->getChain(),                       // Chain
+      LD->getBasePtr(),                     // Ptr
+      DAG.getValueType(N->getValueType(0))  // VT
+    };
+    SDValue BSLoad =
+      DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
+                              DAG.getVTList(N->getValueType(0) == MVT::i64 ?
+                                            MVT::i64 : MVT::i32, MVT::Other),
+                              Ops, LD->getMemoryVT(), LD->getMemOperand());
+
+    // If this is an i16 load, insert the truncate.
+    SDValue ResVal = BSLoad;
+    if (N->getValueType(0) == MVT::i16)
+      ResVal = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16, BSLoad);
+
+    // First, combine the bswap away.  This makes the value produced by the
+    // load dead.
+    DCI.CombineTo(N, ResVal);
+
+    // Next, combine the load away; we give it a bogus result value but a real
+    // chain result.  The result value is dead because the bswap is dead.
+    DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
+
+    // Return N so it doesn't get rechecked!
+    return SDValue(N, 0);
+  }
+
+  // Combine STORE (BSWAP) into STRVH/STRV/STRVG.
+  if (Opcode == ISD::STORE &&
+      cast<StoreSDNode>(N)->isUnindexed() &&
+      !cast<StoreSDNode>(N)->isVolatile() &&
+      N->getOperand(1).getOpcode() == ISD::BSWAP &&
+      N->getOperand(1).getNode()->hasOneUse() &&
+      (N->getOperand(1).getValueType() == MVT::i16 ||
+       N->getOperand(1).getValueType() == MVT::i32 ||
+       N->getOperand(1).getValueType() == MVT::i64)) {
+
+    SDValue BSwapOp = N->getOperand(1).getOperand(0);
+
+    if (BSwapOp.getValueType() == MVT::i16)
+      BSwapOp = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), MVT::i32, BSwapOp);
+
+    SDValue Ops[] = {
+      N->getOperand(0), BSwapOp, N->getOperand(2),
+      DAG.getValueType(N->getOperand(1).getValueType())
+    };
+
+    return DAG.getMemIntrinsicNode(SystemZISD::STRV, SDLoc(N),
+                                   DAG.getVTList(MVT::Other), Ops,
+                                   cast<StoreSDNode>(N)->getMemoryVT(),
+                                   cast<StoreSDNode>(N)->getMemOperand());
+  }
+
   return SDValue();
 }
Index: lib/Target/SystemZ/SystemZInstrInfo.td
===================================================================
--- lib/Target/SystemZ/SystemZInstrInfo.td
+++ lib/Target/SystemZ/SystemZInstrInfo.td
@@ -663,13 +663,14 @@
 // Byte-swapping loads.  Unlike normal loads, these instructions are
 // allowed to access storage more than once.
-def LRV  : UnaryRXY<"lrv",  0xE31E, loadu<bswap, nonvolatile_load>, GR32, 4>;
-def LRVG : UnaryRXY<"lrvg", 0xE30F, loadu<bswap, nonvolatile_load>, GR64, 8>;
+def LRVH : UnaryRXY<"lrvh", 0xE31F, z_lrvh, GR32, 2>;
+def LRV  : UnaryRXY<"lrv",  0xE31E, z_lrv,  GR32, 4>;
+def LRVG : UnaryRXY<"lrvg", 0xE30F, z_lrvg, GR64, 8>;
 
 // Likewise byte-swapping stores.
-def STRV  : StoreRXY<"strv",  0xE33E, storeu<bswap, nonvolatile_store>, GR32, 4>;
-def STRVG : StoreRXY<"strvg", 0xE32F, storeu<bswap, nonvolatile_store>,
-                     GR64, 8>;
+def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
+def STRV  : StoreRXY<"strv",  0xE33E, z_strv,  GR32, 4>;
+def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
 
 //===----------------------------------------------------------------------===//
 // Load address instructions
Index: lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- lib/Target/SystemZ/SystemZOperators.td
+++ lib/Target/SystemZ/SystemZOperators.td
@@ -79,6 +79,14 @@
 def SDT_ZPrefetch           : SDTypeProfile<0, 2,
                                             [SDTCisVT<0, i32>,
                                              SDTCisPtrTy<1>]>;
+def SDT_ZLoadBSwap          : SDTypeProfile<1, 2,
+                                            [SDTCisInt<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, OtherVT>]>;
+def SDT_ZStoreBSwap         : SDTypeProfile<0, 3,
+                                            [SDTCisInt<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, OtherVT>]>;
 def SDT_ZTBegin             : SDTypeProfile<0, 2,
                                             [SDTCisPtrTy<0>,
                                              SDTCisVT<1, i32>]>;
@@ -191,6 +199,11 @@
 def z_membarrier        : SDNode<"SystemZISD::MEMBARRIER", SDTNone,
                                  [SDNPHasChain, SDNPSideEffect]>;
 
+def z_loadbswap         : SDNode<"SystemZISD::LRV", SDT_ZLoadBSwap,
+                                 [SDNPHasChain, SDNPMayLoad]>;
+def z_storebswap        : SDNode<"SystemZISD::STRV", SDT_ZStoreBSwap,
+                                 [SDNPHasChain, SDNPMayStore]>;
+
 // Defined because the index is an i32 rather than a pointer.
 def z_vector_insert     : SDNode<"ISD::INSERT_VECTOR_ELT", SDT_ZInsertVectorElt>;
@@ -331,6 +344,17 @@
 // Pattern fragments
 //===----------------------------------------------------------------------===//
 
+def z_lrvh : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i16)>;
+def z_lrv  : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i32)>;
+def z_lrvg : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i64)>;
+
+def z_strvh : PatFrag<(ops node:$src, node:$addr),
+                      (z_storebswap node:$src, node:$addr, i16)>;
+def z_strv  : PatFrag<(ops node:$src, node:$addr),
+                      (z_storebswap node:$src, node:$addr, i32)>;
+def z_strvg : PatFrag<(ops node:$src, node:$addr),
+                      (z_storebswap node:$src, node:$addr, i64)>;
+
 // Signed and unsigned comparisons.
 def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
   unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
Index: test/CodeGen/SystemZ/bswap-06.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/bswap-06.ll
@@ -0,0 +1,99 @@
+; Test 16-bit byteswaps from memory to registers.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i16 @llvm.bswap.i16(i16 %a)
+
+; Check LRVH with no displacement.
+define i16 @f1(i16 *%src) {
+; CHECK-LABEL: f1:
+; CHECK: lrvh %r2, 0(%r2)
+; CHECK: br %r14
+  %a = load i16, i16 *%src
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %swapped
+}
+
+; Check the high end of the aligned LRVH range.
+define i16 @f2(i16 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lrvh %r2, 524286(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%src, i64 262143
+  %a = load i16, i16 *%ptr
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %swapped
+}
+
+; Check the next halfword up, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define i16 @f3(i16 *%src) {
+; CHECK-LABEL: f3:
+; CHECK: agfi %r2, 524288
+; CHECK: lrvh %r2, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%src, i64 262144
+  %a = load i16, i16 *%ptr
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %swapped
+}
+
+; Check the high end of the negative aligned LRVH range.
+define i16 @f4(i16 *%src) {
+; CHECK-LABEL: f4:
+; CHECK: lrvh %r2, -2(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%src, i64 -1
+  %a = load i16, i16 *%ptr
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %swapped
+}
+
+; Check the low end of the LRVH range.
+define i16 @f5(i16 *%src) {
+; CHECK-LABEL: f5:
+; CHECK: lrvh %r2, -524288(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%src, i64 -262144
+  %a = load i16, i16 *%ptr
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %swapped
+}
+
+; Check the next halfword down, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define i16 @f6(i16 *%src) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r2, -524290
+; CHECK: lrvh %r2, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%src, i64 -262145
+  %a = load i16, i16 *%ptr
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %swapped
+}
+
+; Check that LRVH allows an index.
+define i16 @f7(i64 %src, i64 %index) {
+; CHECK-LABEL: f7:
+; CHECK: lrvh %r2, 524287({{%r3,%r2|%r2,%r3}})
+; CHECK: br %r14
+  %add1 = add i64 %src, %index
+  %add2 = add i64 %add1, 524287
+  %ptr = inttoptr i64 %add2 to i16 *
+  %a = load i16, i16 *%ptr
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %swapped
+}
+
+; Check that volatile accesses do not use LRVH, which might access the
+; storage multiple times.
+define i16 @f8(i16 *%src) {
+; CHECK-LABEL: f8:
+; CHECK: lh [[REG:%r[0-5]]], 0(%r2)
+; CHECK: lrvr %r2, [[REG]]
+; CHECK: br %r14
+  %a = load volatile i16, i16 *%src
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %swapped
+}
Index: test/CodeGen/SystemZ/bswap-07.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/bswap-07.ll
@@ -0,0 +1,100 @@
+; Test 16-bit byteswaps from registers to memory.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i16 @llvm.bswap.i16(i16 %a)
+
+; Check STRVH with no displacement.
+define void @f1(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f1:
+; CHECK: strvh %r3, 0(%r2)
+; CHECK: br %r14
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  store i16 %swapped, i16 *%dst
+  ret void
+}
+
+; Check the high end of the aligned STRVH range.
+define void @f2(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f2:
+; CHECK: strvh %r3, 524286(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%dst, i64 262143
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  store i16 %swapped, i16 *%ptr
+  ret void
+}
+
+; Check the next halfword up, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define void @f3(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f3:
+; CHECK: agfi %r2, 524288
+; CHECK: strvh %r3, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%dst, i64 262144
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  store i16 %swapped, i16 *%ptr
+  ret void
+}
+
+; Check the high end of the negative aligned STRVH range.
+define void @f4(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f4:
+; CHECK: strvh %r3, -2(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%dst, i64 -1
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  store i16 %swapped, i16 *%ptr
+  ret void
+}
+
+; Check the low end of the STRVH range.
+define void @f5(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f5:
+; CHECK: strvh %r3, -524288(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%dst, i64 -262144
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  store i16 %swapped, i16 *%ptr
+  ret void
+}
+
+; Check the next halfword down, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define void @f6(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r2, -524290
+; CHECK: strvh %r3, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%dst, i64 -262145
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  store i16 %swapped, i16 *%ptr
+  ret void
+}
+
+; Check that STRVH allows an index.
+define void @f7(i64 %src, i64 %index, i16 %a) {
+; CHECK-LABEL: f7:
+; CHECK: strvh %r4, 524287({{%r3,%r2|%r2,%r3}})
+; CHECK: br %r14
+  %add1 = add i64 %src, %index
+  %add2 = add i64 %add1, 524287
+  %ptr = inttoptr i64 %add2 to i16 *
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  store i16 %swapped, i16 *%ptr
+  ret void
+}
+
+; Check that volatile stores do not use STRVH, which might access the
+; storage multiple times.
+define void @f8(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f8:
+; CHECK: lrvr [[REG:%r[0-5]]], %r3
+; CHECK: srl [[REG]], 16
+; CHECK: sth [[REG]], 0(%r2)
+; CHECK: br %r14
+  %swapped = call i16 @llvm.bswap.i16(i16 %a)
+  store volatile i16 %swapped, i16 *%dst
+  ret void
+}
Index: test/MC/Disassembler/SystemZ/insns.txt
===================================================================
--- test/MC/Disassembler/SystemZ/insns.txt
+++ test/MC/Disassembler/SystemZ/insns.txt
@@ -5209,6 +5209,36 @@
 # CHECK: lrvr %r15, %r15
 0xb9 0x1f 0x00 0xff
 
+# CHECK: lrvh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x1f
+
+# CHECK: lrvh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x1f
+
+# CHECK: lrvh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x1f
+
+# CHECK: lrvh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x1f
+
+# CHECK: lrvh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x1f
+
+# CHECK: lrvh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x1f
+
+# CHECK: lrvh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x1f
+
+# CHECK: lrvh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x1f
+
+# CHECK: lrvh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x1f
+
+# CHECK: lrvh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x1f
+
 # CHECK: lrv %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x1e
 
@@ -8500,6 +8530,36 @@
 # CHECK: strvg %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x2f
 
+# CHECK: strvh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x3f
+
+# CHECK: strvh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x3f
+
+# CHECK: strvh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x3f
+
+# CHECK: strvh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x3f
+
+# CHECK: strvh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x3f
+
+# CHECK: strvh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x3f
+
+# CHECK: strvh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x3f
+
+# CHECK: strvh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x3f
+
+# CHECK: strvh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x3f
+
+# CHECK: strvh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x3f
+
 # CHECK: strv %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x3e
 
Index: test/MC/SystemZ/insn-good.s
===================================================================
--- test/MC/SystemZ/insn-good.s
+++ test/MC/SystemZ/insn-good.s
@@ -6503,6 +6503,28 @@
 	lrl	%r7,frob@PLT
 	lrl	%r8,frob@PLT
 
+#CHECK: lrvh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x1f]
+#CHECK: lrvh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x1f]
+#CHECK: lrvh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x1f]
+#CHECK: lrvh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x1f]
+#CHECK: lrvh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x1f]
+#CHECK: lrvh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x1f]
+#CHECK: lrvh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x1f]
+#CHECK: lrvh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x1f]
+#CHECK: lrvh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x1f]
+#CHECK: lrvh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x1f]
+
+	lrvh	%r0,-524288
+	lrvh	%r0,-1
+	lrvh	%r0,0
+	lrvh	%r0,1
+	lrvh	%r0,524287
+	lrvh	%r0,0(%r1)
+	lrvh	%r0,0(%r15)
+	lrvh	%r0,524287(%r1,%r15)
+	lrvh	%r0,524287(%r15,%r1)
+	lrvh	%r15,0
+
 #CHECK: lrv %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x1e]
 #CHECK: lrv %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x1e]
 #CHECK: lrv %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x1e]
@@ -8929,6 +8951,28 @@
 	strl	%r7,frob@PLT
 	strl	%r8,frob@PLT
 
+#CHECK: strvh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3f]
+#CHECK: strvh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3f]
+#CHECK: strvh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3f]
+#CHECK: strvh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3f]
+#CHECK: strvh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x3f]
+#CHECK: strvh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3f]
+#CHECK: strvh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3f]
+#CHECK: strvh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3f]
+#CHECK: strvh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3f]
+#CHECK: strvh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3f]
+
+	strvh	%r0,-524288
+	strvh	%r0,-1
+	strvh	%r0,0
+	strvh	%r0,1
+	strvh	%r0,524287
+	strvh	%r0,0(%r1)
+	strvh	%r0,0(%r15)
+	strvh	%r0,524287(%r1,%r15)
+	strvh	%r0,524287(%r15,%r1)
+	strvh	%r15,0
+
 #CHECK: strv %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3e]
 #CHECK: strv %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3e]
 #CHECK: strv %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3e]
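As a usage-level illustration only (not part of the patch; the function names and the use of __builtin_bswap16 are assumptions of this sketch), the new DAG combines target source patterns like the following, which should now select a single LRVH/STRVH on s390x when the access is non-volatile, while volatile accesses keep a separate load/store plus LRVR as the f8 tests above check:

```cpp
// Minimal sketch, assuming a GCC/Clang-compatible compiler targeting s390x.
#include <stdint.h>

// Non-volatile load followed by a 16-bit byte swap: expected to map to "lrvh".
uint16_t load_le16(const uint16_t *p) {
  return __builtin_bswap16(*p);
}

// 16-bit byte swap followed by a non-volatile store: expected to map to "strvh".
void store_le16(uint16_t *p, uint16_t v) {
  *p = __builtin_bswap16(v);
}

// Volatile access: the combine is skipped because LRVH/STRVH are allowed to
// access storage more than once, so a plain load plus "lrvr" remains.
uint16_t load_le16_volatile(const volatile uint16_t *p) {
  return __builtin_bswap16(*p);
}
```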