Skip to content

Commit a6578f7

Browse files
committed Apr 24, 2013
LoopVectorize: Scalarize padded types
This patch disables memory-instruction vectorization for types that need padding bytes; e.g., on x86_64 Darwin, x86_fp80 has a store size of 10 bytes plus 6 bytes of padding. Load/store vectorization is performed by bitcasting to a packed vector, whose memory layout is incompatible with the scalar array because it lacks the padding bytes, so the present vectorizer produces inconsistent results for memory instructions of those types. This patch checks that the AllocSize of the scalar type equals the allocated size for each vector element, to ensure that there are no padding bytes and that the array can be read/written using vector operations. Patch by Daisuke Takahashi! Fixes PR15758. llvm-svn: 180196
1 parent 23a0589 commit a6578f7

File tree

2 files changed

+38
-1
lines changed

2 files changed

+38
-1
lines changed
 

‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -956,6 +956,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
956956
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
957957
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
958958

959+
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
960+
unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
961+
962+
if (ScalarAllocatedSize != VectorElementSize)
963+
return scalarizeInstruction(Instr);
964+
959965
// If the pointer is loop invariant or if it is non consecutive,
960966
// scalarize the load.
961967
int Stride = Legal->isConsecutivePtr(Ptr);
@@ -3558,7 +3564,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
35583564
// Scalarized loads/stores.
35593565
int Stride = Legal->isConsecutivePtr(Ptr);
35603566
bool Reverse = Stride < 0;
3561-
if (0 == Stride) {
3567+
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
3568+
unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
3569+
if (0 == Stride || ScalarAllocatedSize != VectorElementSize) {
35623570
unsigned Cost = 0;
35633571
// The cost of extracting from the value vector and pointer vector.
35643572
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
; RUN: opt -O3 -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 -S < %s | FileCheck %s
2+
3+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
4+
target triple = "x86_64-apple-macosx10.7.0"
5+
6+
@x = common global [1024 x x86_fp80] zeroinitializer, align 16
7+
8+
;CHECK: @example
9+
;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>*
10+
;CHECK: store
11+
;CHECK: ret void
12+
13+
define void @example() nounwind ssp uwtable {
14+
entry:
15+
br label %for.body
16+
17+
for.body: ; preds = %for.body, %entry
18+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
19+
%conv = sitofp i32 1 to x86_fp80
20+
%arrayidx = getelementptr inbounds [1024 x x86_fp80]* @x, i64 0, i64 %indvars.iv
21+
store x86_fp80 %conv, x86_fp80* %arrayidx, align 16
22+
%indvars.iv.next = add i64 %indvars.iv, 1
23+
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
24+
%exitcond = icmp eq i32 %lftr.wideiv, 1024
25+
br i1 %exitcond, label %for.end, label %for.body
26+
27+
for.end: ; preds = %for.body
28+
ret void
29+
}

0 commit comments

Comments
 (0)
Please sign in to comment.