
Commit 71d81e5 · committed Jul 25, 2018
bpf: new option -bpf-expand-memcpy-in-order to expand memcpy in order
Some BPF JIT backends would like to optimize memcpy in their own architecture-specific way. At the moment, however, there is no way for a JIT backend to see memcpy semantics reliably, because the LLVM BPF backend expands memcpy into load/store sequences whose instructions may then be scheduled apart from each other. As a result, BPF JIT backends inside the kernel cannot reliably recognize memcpy semantics by peephole-matching the BPF instruction sequence.

This patch introduces a new intrinsic-expansion infrastructure for memcpy. To get a stable, in-order load/store sequence out of memcpy, we first lower memcpy into a BPF::MEMCPY node, which is then expanded into in-order load/store sequences in the expandPostRAPseudo pass, which runs after instruction scheduling. This way, kernel JIT backends can reliably recognize memcpy by scanning the BPF instruction sequence.

The new memcpy expansion infrastructure is gated by a new option:

  -bpf-expand-memcpy-in-order

Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Yonghong Song <yhs@fb.com>

llvm-svn: 337977
1 parent 99ca3c0 · commit 71d81e5
10 files changed: +371 −8 lines
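To make the effect concrete: with the option enabled, the 9-byte, 2-byte-aligned copy in the cal_align2 test added below compiles to strictly in-order load/store pairs through one scratch register. This rendering mirrors the test's CHECK lines; the concrete register numbers (r1/r2 as the argument registers, r3 as the scratch) are whatever the register allocator assigns:

  r3 = *(u16 *)(r2 + 0)
  *(u16 *)(r1 + 0) = r3
  r3 = *(u16 *)(r2 + 2)
  *(u16 *)(r1 + 2) = r3
  r3 = *(u16 *)(r2 + 4)
  *(u16 *)(r1 + 4) = r3
  r3 = *(u16 *)(r2 + 6)
  *(u16 *)(r1 + 6) = r3
  r3 = *(u8 *)(r2 + 8)
  *(u8 *)(r1 + 8) = r3

A kernel JIT can pattern-match this back into a single memcpy.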
 

llvm/lib/Target/BPF/BPFISelLowering.cpp (+68 −5)

@@ -33,6 +33,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "bpf-lower"
 
+static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
+  cl::Hidden, cl::init(false),
+  cl::desc("Expand memcpy into load/store pairs in order"));
+
 static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
   MachineFunction &MF = DAG.getMachineFunction();
   DAG.getContext()->diagnose(
@@ -132,10 +136,30 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
   setMinFunctionAlignment(3);
   setPrefFunctionAlignment(3);
 
-  // inline memcpy() for kernel to see explicit copy
-  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
-  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+  if (BPFExpandMemcpyInOrder) {
+    // LLVM's generic code expands memcpy into load/store pairs at this stage,
+    // which is before quite a few optimization passes, so the loads and
+    // stores could later be moved apart from each other, and that would
+    // break the memcpy pattern matchers inside kernel eBPF JIT compilers.
+    //
+    // When -bpf-expand-memcpy-in-order is specified, we want to defer the
+    // expansion of memcpy to a later stage in the pipeline so the load/store
+    // pairs won't be touched and are kept in order. Hence, we set
+    // MaxStoresPerMem* to zero to disable the generic getMemcpyLoadsAndStores
+    // code path, and ask LLVM to use the target expander
+    // EmitTargetCodeForMemcpy instead.
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 0;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 0;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 0;
+  } else {
+    // Inline memcpy() so the kernel can see the explicit copy.
+    unsigned CommonMaxStores =
+        STI.getSelectionDAGInfo()->getCommonMaxStoresPerMemFunc();
+
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = CommonMaxStores;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = CommonMaxStores;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = CommonMaxStores;
+  }
 
   // CPU/Feature control
   HasAlu32 = STI.getHasAlu32();
@@ -518,6 +542,8 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "BPFISD::BR_CC";
   case BPFISD::Wrapper:
     return "BPFISD::Wrapper";
+  case BPFISD::MEMCPY:
+    return "BPFISD::MEMCPY";
   }
   return nullptr;
 }
@@ -556,6 +582,37 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
   return PromotedReg2;
 }
 
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                     MachineBasicBlock *BB)
+                                                     const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB(*MF, MI);
+  unsigned ScratchReg;
+
+  // This function does custom insertion while lowering BPFISD::MEMCPY, which
+  // only has two register operands from the memcpy semantics: the copy source
+  // address and the copy destination address.
+  //
+  // Because we will expand BPFISD::MEMCPY into load/store pairs, we need a
+  // third scratch register to serve as the destination register of the loads
+  // and the source register of the stores.
+  //
+  // The scratch register here carries the Define | Dead | EarlyClobber flags.
+  // The EarlyClobber flag has the semantic property that the operand it is
+  // attached to is clobbered before the rest of the inputs are read. Hence it
+  // must be unique among the operands to the instruction. The Define flag is
+  // needed to convince the machine verifier that an Undef value isn't a
+  // problem, since we are loading memory into the register anyway. The Dead
+  // flag is needed because the value in the scratch register isn't supposed
+  // to be used by any other instruction.
+  ScratchReg = MRI.createVirtualRegister(&BPF::GPRRegClass);
+  MIB.addReg(ScratchReg,
+             RegState::Define | RegState::Dead | RegState::EarlyClobber);
+
+  return BB;
+}
+
 MachineBasicBlock *
 BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -567,16 +624,22 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                       Opc == BPF::Select_32 ||
                       Opc == BPF::Select_32_64);
 
+  bool isMemcpyOp = Opc == BPF::MEMCPY;
+
 #ifndef NDEBUG
   bool isSelectRIOp = (Opc == BPF::Select_Ri ||
                        Opc == BPF::Select_Ri_64_32 ||
                        Opc == BPF::Select_Ri_32 ||
                        Opc == BPF::Select_Ri_32_64);
 
-  assert((isSelectRROp || isSelectRIOp) && "Unexpected instr type to insert");
+  assert((isSelectRROp || isSelectRIOp || isMemcpyOp) &&
+         "Unexpected instr type to insert");
 #endif
 
+  if (isMemcpyOp)
+    return EmitInstrWithCustomInserterMemcpy(MI, BB);
+
   bool is32BitCmp = (Opc == BPF::Select_32 ||
                      Opc == BPF::Select_32_64 ||
                      Opc == BPF::Select_Ri_32 ||

llvm/lib/Target/BPF/BPFISelLowering.h (+7 −1)

@@ -28,7 +28,8 @@ enum NodeType : unsigned {
   CALL,
   SELECT_CC,
   BR_CC,
-  Wrapper
+  Wrapper,
+  MEMCPY
 };
 }

@@ -110,6 +111,11 @@ class BPFTargetLowering : public TargetLowering {
 
   unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
                          bool isSigned) const;
+
+  MachineBasicBlock *EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                       MachineBasicBlock *BB)
+                                                       const;
+
 };
 }

llvm/lib/Target/BPF/BPFInstrInfo.cpp (+77)

@@ -43,6 +43,83 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   llvm_unreachable("Impossible reg-to-reg copy");
 }
 
+void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  uint64_t CopyLen = MI->getOperand(2).getImm();
+  uint64_t Alignment = MI->getOperand(3).getImm();
+  unsigned ScratchReg = MI->getOperand(4).getReg();
+  MachineBasicBlock *BB = MI->getParent();
+  DebugLoc dl = MI->getDebugLoc();
+  unsigned LdOpc, StOpc;
+
+  switch (Alignment) {
+  case 1:
+    LdOpc = BPF::LDB;
+    StOpc = BPF::STB;
+    break;
+  case 2:
+    LdOpc = BPF::LDH;
+    StOpc = BPF::STH;
+    break;
+  case 4:
+    LdOpc = BPF::LDW;
+    StOpc = BPF::STW;
+    break;
+  case 8:
+    LdOpc = BPF::LDD;
+    StOpc = BPF::STD;
+    break;
+  default:
+    llvm_unreachable("unsupported memcpy alignment");
+  }
+
+  unsigned IterationNum = CopyLen >> Log2_64(Alignment);
+  for (unsigned I = 0; I < IterationNum; ++I) {
+    BuildMI(*BB, MI, dl, get(LdOpc))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(I * Alignment);
+    BuildMI(*BB, MI, dl, get(StOpc))
+        .addReg(ScratchReg).addReg(DstReg).addImm(I * Alignment);
+  }
+
+  unsigned BytesLeft = CopyLen & (Alignment - 1);
+  unsigned Offset = IterationNum * Alignment;
+  bool Hanging4Byte = BytesLeft & 0x4;
+  bool Hanging2Byte = BytesLeft & 0x2;
+  bool Hanging1Byte = BytesLeft & 0x1;
+  if (Hanging4Byte) {
+    BuildMI(*BB, MI, dl, get(BPF::LDW))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+    BuildMI(*BB, MI, dl, get(BPF::STW))
+        .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+    Offset += 4;
+  }
+  if (Hanging2Byte) {
+    BuildMI(*BB, MI, dl, get(BPF::LDH))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+    BuildMI(*BB, MI, dl, get(BPF::STH))
+        .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+    Offset += 2;
+  }
+  if (Hanging1Byte) {
+    BuildMI(*BB, MI, dl, get(BPF::LDB))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+    BuildMI(*BB, MI, dl, get(BPF::STB))
+        .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+  }
+
+  BB->erase(MI);
+}
+
+bool BPFInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  if (MI.getOpcode() == BPF::MEMCPY) {
+    expandMEMCPY(MI);
+    return true;
+  }
+
+  return false;
+}
+
 void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        unsigned SrcReg, bool IsKill, int FI,
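The decomposition expandMEMCPY performs is easiest to see on a concrete length. Here is a small standalone C++ sketch (mine, not part of the patch) that mirrors the arithmetic for the cal_align4 case from the test below (CopyLen = 19, Alignment = 4) and prints the load/store schedule the pass would emit:

  #include <cinttypes>
  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t CopyLen = 19, Alignment = 4;

    // Full-width copies: IterationNum = CopyLen >> Log2_64(Alignment) = 4,
    // one aligned load/store pair per Alignment bytes.
    uint64_t Offset = 0;
    for (uint64_t I = 0; I < CopyLen / Alignment; ++I, Offset += Alignment)
      std::printf("%" PRIu64 "-byte load/store pair at offset %" PRIu64 "\n",
                  Alignment, Offset);

    // Tail: BytesLeft = CopyLen & (Alignment - 1) = 3, handled by the
    // Hanging4Byte/Hanging2Byte/Hanging1Byte branches (4, then 2, then 1).
    const uint64_t BytesLeft = CopyLen & (Alignment - 1);
    for (uint64_t Chunk = 4; Chunk >= 1; Chunk /= 2)
      if (BytesLeft & Chunk) {
        std::printf("%" PRIu64 "-byte load/store pair at offset %" PRIu64 "\n",
                    Chunk, Offset);
        Offset += Chunk;
      }
    return 0;
  }

Its output lists 4-byte pairs at offsets 0, 4, 8 and 12, then a 2-byte pair at 16 and a 1-byte pair at 18: exactly the offsets checked by the cal_align4 CHECK lines in the test.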

llvm/lib/Target/BPF/BPFInstrInfo.h (+5)

@@ -34,6 +34,8 @@ class BPFInstrInfo : public BPFGenInstrInfo {
                    const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
 
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, unsigned SrcReg,
                            bool isKill, int FrameIndex,
@@ -55,6 +57,9 @@ class BPFInstrInfo : public BPFGenInstrInfo {
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+private:
+  void expandMEMCPY(MachineBasicBlock::iterator) const;
+
 };
 }

llvm/lib/Target/BPF/BPFInstrInfo.td (+15)

@@ -28,6 +28,10 @@ def SDT_BPFBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
                                        SDTCisVT<3, OtherVT>]>;
 def SDT_BPFWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                           SDTCisPtrTy<0>]>;
+def SDT_BPFMEMCPY : SDTypeProfile<0, 4, [SDTCisVT<0, i64>,
+                                         SDTCisVT<1, i64>,
+                                         SDTCisVT<2, i64>,
+                                         SDTCisVT<3, i64>]>;
 
 def BPFcall : SDNode<"BPFISD::CALL", SDT_BPFCall,
                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@@ -43,6 +47,9 @@ def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
 
 def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
 def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
+                       [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                        SDNPMayStore, SDNPMayLoad]>;
 def BPFIsLittleEndian : Predicate<"CurDAG->getDataLayout().isLittleEndian()">;
 def BPFIsBigEndian : Predicate<"!CurDAG->getDataLayout().isLittleEndian()">;
 def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
@@ -714,3 +721,11 @@ let Predicates = [BPFHasALU32] in {
   def : Pat<(i64 (extloadi32 ADDRri:$src)),
             (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
 }
+
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
+  def MEMCPY : Pseudo<
+    (outs),
+    (ins GPR:$dst, GPR:$src, i64imm:$len, i64imm:$align, variable_ops),
+    "#memcpy dst: $dst, src: $src, len: $len, align: $align",
+    [(BPFmemcpy GPR:$dst, GPR:$src, imm:$len, imm:$align)]>;
+}
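To connect this TableGen definition to the C++ above (my summary, not text from the patch): the pseudo is declared with four fixed input operands plus variable_ops, and EmitInstrWithCustomInserterMemcpy appends the scratch register through that variable-operand tail. By the time expandPostRAPseudo sees the instruction it therefore has the schematic shape

  MEMCPY $dst, $src, $len, $align, $scratch

whose operand indices 0 through 4 match the getOperand() calls in expandMEMCPY.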
llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp (new file, +43)

@@ -0,0 +1,43 @@
+//===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-selectiondag-info"
+
+SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  // Requires the copy size to be a constant.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  unsigned CopyLen = ConstantSize->getZExtValue();
+  unsigned StoresNumEstimate = alignTo(CopyLen, Align) >> Log2_32(Align);
+  // Impose the same copy length limit as MaxStoresPerMemcpy.
+  if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+                    DAG.getConstant(CopyLen, dl, MVT::i64),
+                    DAG.getConstant(Align, dl, MVT::i64));
+
+  return Dst.getValue(0);
+}
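A worked example of the size guard (values from the cal_align8 test below, arithmetic mine): CopyLen = 27 and Align = 8 give StoresNumEstimate = alignTo(27, 8) >> Log2_32(8) = 32 >> 3 = 4, well under getCommonMaxStoresPerMemFunc() = 128, so a BPFISD::MEMCPY node is emitted. For a constant length above the limit, or a non-constant length, the hook returns the empty SDValue() and lowering falls back to the generic path.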
llvm/lib/Target/BPF/BPFSelectionDAGInfo.h (new file, +36)

@@ -0,0 +1,36 @@
+//===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the BPF subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+
+  unsigned getCommonMaxStoresPerMemFunc() const { return 128; }
+
+};
+
+}
+
+#endif
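One design note (my reading of the diff): the limit of 128 that was previously hardcoded in BPFTargetLowering now lives only in this getCommonMaxStoresPerMemFunc() accessor, so the MaxStoresPerMem* thresholds on the default path and the copy-length guard in EmitTargetCodeForMemcpy cannot silently drift apart.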

llvm/lib/Target/BPF/BPFSubtarget.h (+3 −2)

@@ -17,6 +17,7 @@
 #include "BPFFrameLowering.h"
 #include "BPFISelLowering.h"
 #include "BPFInstrInfo.h"
+#include "BPFSelectionDAGInfo.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -33,7 +34,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
   BPFInstrInfo InstrInfo;
   BPFFrameLowering FrameLowering;
   BPFTargetLowering TLInfo;
-  SelectionDAGTargetInfo TSInfo;
+  BPFSelectionDAGInfo TSInfo;
 
 private:
   void initializeEnvironment();
@@ -75,7 +76,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
   const BPFTargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }
-  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+  const BPFSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
   const TargetRegisterInfo *getRegisterInfo() const override {

llvm/lib/Target/BPF/CMakeLists.txt (+1)

@@ -20,6 +20,7 @@ add_llvm_target(BPFCodeGen
   BPFISelLowering.cpp
   BPFMCInstLower.cpp
   BPFRegisterInfo.cpp
+  BPFSelectionDAGInfo.cpp
   BPFSubtarget.cpp
   BPFTargetMachine.cpp
   BPFMIPeephole.cpp
llvm/test/CodeGen/BPF/ (new test file, +116)

@@ -0,0 +1,116 @@
+; RUN: llc < %s -march=bpfel -bpf-expand-memcpy-in-order | FileCheck %s
+; RUN: llc < %s -march=bpfeb -bpf-expand-memcpy-in-order | FileCheck %s
+;
+; #define COPY_LEN 9
+;
+; void cal_align1(void *a, void *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; void cal_align2(short *a, short *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN 19
+; void cal_align4(int *a, int *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN 27
+; void cal_align8(long long *a, long long *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+
+; Function Attrs: nounwind
+define dso_local void @cal_align1(i8* nocapture %a, i8* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 9, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
+
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u8 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u8 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 1)
+; CHECK: *(u8 *)([[DST_REG]] + 1) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 2)
+; CHECK: *(u8 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 3)
+; CHECK: *(u8 *)([[DST_REG]] + 3) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 4)
+; CHECK: *(u8 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 5)
+; CHECK: *(u8 *)([[DST_REG]] + 5) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 6)
+; CHECK: *(u8 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 7)
+; CHECK: *(u8 *)([[DST_REG]] + 7) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align2(i16* nocapture %a, i16* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i16* %a to i8*
+  %1 = bitcast i16* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 %0, i8* align 2 %1, i64 9, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u16 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u16 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 2)
+; CHECK: *(u16 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 4)
+; CHECK: *(u16 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 6)
+; CHECK: *(u16 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align4(i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast i32* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 19, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u32 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u32 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 4)
+; CHECK: *(u32 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 8)
+; CHECK: *(u32 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 12)
+; CHECK: *(u32 *)([[DST_REG]] + 12) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 16)
+; CHECK: *(u16 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 18)
+; CHECK: *(u8 *)([[DST_REG]] + 18) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align8(i64* nocapture %a, i64* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i64* %a to i8*
+  %1 = bitcast i64* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 27, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u64 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u64 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 8)
+; CHECK: *(u64 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 16)
+; CHECK: *(u64 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 24)
+; CHECK: *(u16 *)([[DST_REG]] + 24) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 26)
+; CHECK: *(u8 *)([[DST_REG]] + 26) = [[SCRATCH_REG]]
