| author | Eric Christopher <echristo@gmail.com> | 2019-04-17 04:52:47 +0000 |
|---|---|---|
| committer | Eric Christopher <echristo@gmail.com> | 2019-04-17 04:52:47 +0000 |
| commit | cee313d288a4faf0355d76fb6e0e927e211d08a5 | |
| tree | d386075318d761197779a96e5d8fc0dc7b06342b | llvm/test/Transforms/LoadStoreVectorizer/X86 |
| parent | c3d6a929fdd92fd06d4304675ade8d7210ee711a | |
Revert "Temporarily Revert "Add basic loop fusion pass.""
The reversion apparently deleted the test/Transforms directory.
Will be re-reverting again.
llvm-svn: 358552
Diffstat (limited to 'llvm/test/Transforms/LoadStoreVectorizer/X86')

| llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll | 80 |
|---|---|
| llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll | 77 |
| llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll | 28 |
| llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg | 3 |
| llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll | 40 |
| llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll | 48 |
| llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll | 31 |
| llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll | 29 |
| llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll | 78 |
| llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll | 118 |
| llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll | 15 |

11 files changed, 547 insertions, 0 deletions
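Each restored file is a self-contained lit test: the `; RUN:` lines at the head of each file are the exact commands lit substitutes and executes. As a usage sketch (the `./build` build-tree path below is an assumption, not part of the patch), the directory or a single test can be exercised like this:

```sh
# Run the whole restored directory through lit (assumes a configured
# LLVM build tree at ./build).
./build/bin/llvm-lit -v llvm/test/Transforms/LoadStoreVectorizer/X86

# Or replay one test's RUN line by hand: opt applies the vectorizer,
# FileCheck verifies the CHECK lines embedded in the same file.
TEST=llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
./build/bin/opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - "$TEST" \
  | ./build/bin/FileCheck "$TEST"
```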
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
new file mode 100644
index 00000000000..e29f3dfa537
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
@@ -0,0 +1,80 @@
+; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt -codegenprepare -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
+
+target triple = "x86_64--"
+
+%union = type { { [4 x [4 x [4 x [16 x float]]]], [4 x [4 x [4 x [16 x float]]]], [10 x [10 x [4 x float]]] } }
+
+@global_pointer = external unnamed_addr global { %union, [2000 x i8] }, align 4
+
+; Function Attrs: convergent nounwind
+define void @test(i32 %base) #0 {
+; CHECK-LABEL: @test(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4
+  %add350.4 = add i32 4, %mul331
+  %idx351.4 = zext i32 %add350.4 to i64
+  %arrayidx352.4 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.4
+  %tmp296.4 = bitcast float* %arrayidx352.4 to i32*
+  %add350.5 = add i32 5, %mul331
+  %idx351.5 = zext i32 %add350.5 to i64
+  %arrayidx352.5 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.5
+  %tmp296.5 = bitcast float* %arrayidx352.5 to i32*
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads:
+  ; If and only if the loads are in a different BB from the GEPs codegenprepare
+  ; would try to turn the GEPs into math, which makes LoadStoreVectorizer's job
+  ; harder
+  %tmp297.4 = load i32, i32* %tmp296.4, align 4, !tbaa !0
+  %tmp297.5 = load i32, i32* %tmp296.5, align 4, !tbaa !0
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+define void @test.codegenprepared(i32 %base) #0 {
+; CHECK-LABEL: @test.codegenprepared(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4
+  %add350.4 = add i32 4, %mul331
+  %idx351.4 = zext i32 %add350.4 to i64
+  %add350.5 = add i32 5, %mul331
+  %idx351.5 = zext i32 %add350.5 to i64
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads: ; preds = %entry
+  %sunkaddr = mul i64 %idx351.4, 4
+  %sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr
+  %sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096
+  %0 = bitcast i8* %sunkaddr2 to i32*
+  %tmp297.4 = load i32, i32* %0, align 4, !tbaa !0
+  %sunkaddr3 = mul i64 %idx351.5, 4
+  %sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr3
+  %sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096
+  %1 = bitcast i8* %sunkaddr5 to i32*
+  %tmp297.5 = load i32, i32* %1, align 4, !tbaa !0
+  br label %exit
+
+exit: ; preds = %loads, %entry
+  ret void
+}
+
+attributes #0 = { convergent nounwind }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
new file mode 100644
index 00000000000..e2181f6086c
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
@@ -0,0 +1,77 @@
+; RUN: opt -load-store-vectorizer %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S | FileCheck %s
+
+; Check that setting wrapping flags after a SCEV node is created
+; does not invalidate "sorted by complexity" invariant for
+; operands of commutative and associative SCEV operators.
+
+target triple = "x86_64--"
+
+@global_value0 = external constant i32
+@global_value1 = external constant i32
+@other_value = external global float
+@a = external global float
+@b = external global float
+@c = external global float
+@d = external global float
+@plus1 = external global i32
+@cnd = external global i8
+
+; Function Attrs: nounwind
+define void @main() local_unnamed_addr #0 {
+; CHECK-LABEL: @main()
+; CHECK: [[PTR:%[0-9]+]] = bitcast float* %preheader.load0.address to <2 x float>*
+; CHECK: = load <2 x float>, <2 x float>* [[PTR]]
+; CHECK-LABEL: for.body23:
+entry:
+  %tmp = load i32, i32* @global_value0, !range !0
+  %tmp2 = load i32, i32* @global_value1
+  %and.i.i = and i32 %tmp2, 2
+  %add.nuw.nsw.i.i = add nuw nsw i32 %and.i.i, 0
+  %mul.i.i = shl nuw nsw i32 %add.nuw.nsw.i.i, 1
+  %and6.i.i = and i32 %tmp2, 3
+  %and9.i.i = and i32 %tmp2, 4
+  %add.nuw.nsw10.i.i = add nuw nsw i32 %and6.i.i, %and9.i.i
+  %conv3.i42.i = add nuw nsw i32 %mul.i.i, 1
+  %reass.add346.7 = add nuw nsw i32 %add.nuw.nsw10.i.i, 56
+  %reass.mul347.7 = mul nuw nsw i32 %tmp, %reass.add346.7
+  %add7.i.7 = add nuw nsw i32 %reass.mul347.7, 0
+  %preheader.address0.idx = add nuw nsw i32 %add7.i.7, %mul.i.i
+  %preheader.address0.idx.zext = zext i32 %preheader.address0.idx to i64
+  %preheader.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.address0.idx.zext
+  %preheader.load0. = load float, float* %preheader.load0.address, align 4, !tbaa !1
+  %common.address.idx = add nuw nsw i32 %add7.i.7, %conv3.i42.i
+  %preheader.header.common.address.idx.zext = zext i32 %common.address.idx to i64
+  %preheader.load1.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext
+  %preheader.load1. = load float, float* %preheader.load1.address, align 4, !tbaa !1
+  br label %for.body23
+
+for.body23: ; preds = %for.body23, %entry
+  %loop.header.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext
+  %loop.header.load0. = load float, float* %loop.header.load0.address, align 4, !tbaa !1
+  %reass.mul343.7 = mul nuw nsw i32 %reass.add346.7, 72
+  %add7.i286.7.7 = add nuw nsw i32 %reass.mul343.7, 56
+  %add9.i288.7.7 = add nuw nsw i32 %add7.i286.7.7, %mul.i.i
+  %loop.header.address1.idx = add nuw nsw i32 %add9.i288.7.7, 1
+  %loop.header.address1.idx.zext = zext i32 %loop.header.address1.idx to i64
+  %loop.header.load1.address = getelementptr inbounds float, float* @other_value, i64 %loop.header.address1.idx.zext
+  %loop.header.load1. = load float, float* %loop.header.load1.address, align 4, !tbaa !1
+  store float %preheader.load0., float* @a, align 4, !tbaa !1
+  store float %preheader.load1., float* @b, align 4, !tbaa !1
+  store float %loop.header.load0., float* @c, align 4, !tbaa !1
+  store float %loop.header.load1., float* @d, align 4, !tbaa !1
+  %loaded.cnd = load i8, i8* @cnd
+  %condition = trunc i8 %loaded.cnd to i1
+  br i1 %condition, label %for.body23, label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{i32 0, i32 65536}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
new file mode 100644
index 00000000000..043d6ea7e92
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -0,0 +1,28 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @correct_order(
+; CHECK: [[LOAD_PTR:%[0-9]+]] = bitcast i32* %next.gep1
+; CHECK: load <2 x i32>, <2 x i32>* [[LOAD_PTR]]
+; CHECK: load i32, i32* %next.gep
+; CHECK: [[STORE_PTR:%[0-9]+]] = bitcast i32* %next.gep
+; CHECK: store <2 x i32>
+; CHECK-SAME: <2 x i32>* [[STORE_PTR]]
+; CHECK: load i32, i32* %next.gep1
+define void @correct_order(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg b/llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg
new file mode 100644
index 00000000000..e71f3cc4c41
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
new file mode 100644
index 00000000000..ac5f3ea9f0f
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
@@ -0,0 +1,40 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+
+define <8 x double> @loadwidth_insert_extract(double* %ptr) {
+  %a = bitcast double* %ptr to <2 x double> *
+  %b = getelementptr <2 x double>, <2 x double>* %a, i32 1
+  %c = getelementptr <2 x double>, <2 x double>* %a, i32 2
+  %d = getelementptr <2 x double>, <2 x double>* %a, i32 3
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW-NOT: load
+; CHECK-KNL: load <8 x double>
+; CHECK-KNL-NOT: load
+  %la = load <2 x double>, <2 x double> *%a
+  %lb = load <2 x double>, <2 x double> *%b
+  %lc = load <2 x double>, <2 x double> *%c
+  %ld = load <2 x double>, <2 x double> *%d
+  ; Scalarize everything - Explicitly not a shufflevector to test this code
+  ; path in the LSV
+  %v1 = extractelement <2 x double> %la, i32 0
+  %v2 = extractelement <2 x double> %la, i32 1
+  %v3 = extractelement <2 x double> %lb, i32 0
+  %v4 = extractelement <2 x double> %lb, i32 1
+  %v5 = extractelement <2 x double> %lc, i32 0
+  %v6 = extractelement <2 x double> %lc, i32 1
+  %v7 = extractelement <2 x double> %ld, i32 0
+  %v8 = extractelement <2 x double> %ld, i32 1
+  ; Make a vector again
+  %i1 = insertelement <8 x double> undef, double %v1, i32 0
+  %i2 = insertelement <8 x double> %i1, double %v2, i32 1
+  %i3 = insertelement <8 x double> %i2, double %v3, i32 2
+  %i4 = insertelement <8 x double> %i3, double %v4, i32 3
+  %i5 = insertelement <8 x double> %i4, double %v5, i32 4
+  %i6 = insertelement <8 x double> %i5, double %v6, i32 5
+  %i7 = insertelement <8 x double> %i6, double %v7, i32 6
+  %i8 = insertelement <8 x double> %i7, double %v8, i32 7
+  ret <8 x double> %i8
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
new file mode 100644
index 00000000000..a93e9aceb73
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
@@ -0,0 +1,48 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \
+; RUN:   FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S < %s | \
+; RUN:   FileCheck %s
+;
+; The GPU Load & Store Vectorizer may merge differently-typed accesses into a
+; single instruction. This test checks that we merge TBAA tags for such
+; accesses correctly.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; struct S {
+;   float f;
+;   int i;
+; };
+%struct.S = type { float, i32 }
+
+; float foo(S *p) {
+;   p->f -= 1;
+;   p->i -= 1;
+;   return p->f;
+; }
+define float @foo(%struct.S* %p) {
+entry:
+; CHECK-LABEL: foo
+; CHECK: load <2 x i32>, {{.*}}, !tbaa [[TAG_char:!.*]]
+; CHECK: store <2 x i32> {{.*}}, !tbaa [[TAG_char]]
+  %f = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 0
+  %0 = load float, float* %f, align 4, !tbaa !2
+  %sub = fadd float %0, -1.000000e+00
+  store float %sub, float* %f, align 4, !tbaa !2
+  %i = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 1
+  %1 = load i32, i32* %i, align 4, !tbaa !8
+  %sub1 = add nsw i32 %1, -1
+  store i32 %sub1, i32* %i, align 4, !tbaa !8
+  ret float %sub
+}
+
+!2 = !{!3, !4, i64 0}
+!3 = !{!"_ZTS1S", !4, i64 0, !7, i64 4}
+!4 = !{!"float", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!"int", !5, i64 0}
+!8 = !{!3, !7, i64 4}
+
+; CHECK-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", {{.*}}, i64 0}
+; CHECK-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
new file mode 100644
index 00000000000..7a0073808a0
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -load-store-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+%rec = type { i32, i28 }
+
+; We currently do not optimize this scenario.
+; But we verify that we no longer crash when compiling this.
+define void @test1(%rec* %out, %rec* %in) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[IN1:%.*]] = getelementptr [[REC:%.*]], %rec* [[IN:%.*]], i16 0, i32 0
+; CHECK-NEXT:    [[IN2:%.*]] = getelementptr [[REC]], %rec* [[IN]], i16 0, i32 1
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, i32* [[IN1]], align 8
+; CHECK-NEXT:    [[VAL2:%.*]] = load i28, i28* [[IN2]]
+; CHECK-NEXT:    [[OUT1:%.*]] = getelementptr [[REC]], %rec* [[OUT:%.*]], i16 0, i32 0
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr [[REC]], %rec* [[OUT]], i16 0, i32 1
+; CHECK-NEXT:    store i32 [[VAL1]], i32* [[OUT1]], align 8
+; CHECK-NEXT:    store i28 [[VAL2]], i28* [[OUT2]]
+; CHECK-NEXT:    ret void
+;
+  %in1 = getelementptr %rec, %rec* %in, i16 0, i32 0
+  %in2 = getelementptr %rec, %rec* %in, i16 0, i32 1
+  %val1 = load i32, i32* %in1, align 8
+  %val2 = load i28, i28* %in2
+  %out1 = getelementptr %rec, %rec* %out, i16 0, i32 0
+  %out2 = getelementptr %rec, %rec* %out, i16 0, i32 1
+  store i32 %val1, i32* %out1, align 8
+  store i28 %val2, i28* %out2
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
new file mode 100644
index 00000000000..3cfe7454baf
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
@@ -0,0 +1,29 @@
+; RUN: opt -mtriple=x86_64-unknown-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+%struct.buffer_t = type { i32, i8* }
+
+; Check an i32 and i8* get vectorized, and that the two accesses
+; (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions which were between the first and last load should be
+; reordered preserving their relative order inasmuch as possible.
+
+; CHECK-LABEL: @preserve_order_32(
+; CHECK: load <2 x i32>
+; CHECK: %buff.val = load i8
+; CHECK: store i8 0
+define void @preserve_order_32(%struct.buffer_t* noalias %buff) #0 {
+entry:
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 1
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
+  store i8 0, i8* %buff.p, align 8
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 0
+  %buff.int = load i32, i32* %tmp0, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
new file mode 100644
index 00000000000..3ae0d891dc5
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
@@ -0,0 +1,78 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+%struct.buffer_t = type { i64, i8* }
+%struct.nested.buffer = type { %struct.buffer_t, %struct.buffer_t }
+
+; Check an i64 and i8* get vectorized, and that the two accesses
+; (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions which were between the first and last load should be
+; reordered preserving their relative order inasmuch as possible.
+
+; CHECK-LABEL: @preserve_order_64(
+; CHECK: load <2 x i64>
+; CHECK: %buff.val = load i8
+; CHECK: store i8 0
+define void @preserve_order_64(%struct.buffer_t* noalias %buff) #0 {
+entry:
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
+  store i8 0, i8* %buff.p, align 8
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 16
+  ret void
+}
+
+; Check reordering recurses correctly.
+
+; CHECK-LABEL: @transitive_reorder(
+; CHECK: load <2 x i64>
+; CHECK: %buff.val = load i8
+; CHECK: store i8 0
+define void @transitive_reorder(%struct.buffer_t* noalias %buff, %struct.nested.buffer* noalias %nest) #0 {
+entry:
+  %nest0_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest0_0, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
+  store i8 0, i8* %buff.p, align 8
+  %nest1_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest1_0, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 16
+  ret void
+}
+
+; Check for no vectorization over phi node
+
+; CHECK-LABEL: @no_vect_phi(
+; CHECK: load i8*
+; CHECK: load i8
+; CHECK: store i8 0
+; CHECK: load i64
+define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) {
+entry:
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
+  store i8 0, i8* %buff.p, align 8
+  br label %"for something"
+
+"for something":
+  %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ]
+
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 16
+
+  %index.next = add i64 %index, 8
+  %cmp_res = icmp eq i64 %index.next, 8
+  br i1 %cmp_res, label %ending, label %"for something"
+
+ending:
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
new file mode 100644
index 00000000000..72b29912d81
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -0,0 +1,118 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Vectorized subsets of the load/store chains in the presence of
+; interleaved loads/stores
+
+; CHECK-LABEL: @interleave_2L_2S(
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+define void @interleave_2L_2S(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @interleave_3L_2S_1L(
+; CHECK: load <3 x i32>
+; CHECK: store <2 x i32>
+; CHECK: load i32
+
+define void @interleave_3L_2S_1L(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  %l1 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @chain_suffix(
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load <2 x i32>
+define void @chain_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+
+; CHECK-LABEL: @chain_prefix_suffix(
+; CHECK: load <2 x i32>
+; CHECK: store <2 x i32>
+; CHECK: load <3 x i32>
+define void @chain_prefix_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+  %next.gep3 = getelementptr i32, i32* %ptr, i64 3
+
+  %l1 = load i32, i32* %next.gep, align 4
+  %l2 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep2, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+  %l5 = load i32, i32* %next.gep3, align 4
+
+  ret void
+}
+
+; FIXME: If the chain is too long and TLI says misaligned is not fast,
+; then LSV fails to vectorize anything in that chain.
+; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7.
+
+; CHECK-LABEL: @interleave_get_longest
+; CHECK: load <3 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+
+define void @interleave_get_longest(i32* noalias %ptr) {
+  %tmp1 = getelementptr i32, i32* %ptr, i64 0
+  %tmp2 = getelementptr i32, i32* %ptr, i64 1
+  %tmp3 = getelementptr i32, i32* %ptr, i64 2
+  %tmp4 = getelementptr i32, i32* %ptr, i64 3
+
+  %l1 = load i32, i32* %tmp2, align 4
+  %l2 = load i32, i32* %tmp1, align 4
+  store i32 0, i32* %tmp2, align 4
+  store i32 0, i32* %tmp1, align 4
+  %l3 = load i32, i32* %tmp2, align 4
+  %l4 = load i32, i32* %tmp3, align 4
+  %l5 = load i32, i32* %tmp4, align 4
+  %l6 = load i32, i32* %tmp4, align 4
+  %l7 = load i32, i32* %tmp4, align 4
+
+  ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
new file mode 100644
index 00000000000..00971f35038
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
@@ -0,0 +1,15 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck %s
+
+; Check that the LoadStoreVectorizer does not crash due to not differentiating <1 x T> and T.
+
+; CHECK-LABEL: @vector_scalar(
+; CHECK: store double
+; CHECK: store <1 x double>
+define void @vector_scalar(double* %ptr, double %a, <1 x double> %b) {
+  %1 = bitcast double* %ptr to <1 x double>*
+  %2 = getelementptr <1 x double>, <1 x double>* %1, i32 1
+  store double %a, double* %ptr, align 8
+  store <1 x double> %b, <1 x double>* %2, align 8
+  ret void
+}
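The common thread in these tests is the rewrite the LoadStoreVectorizer performs: adjacent, alias-free accesses to the same underlying object are merged into one wide vector access, with scalars recovered by extracts. A minimal illustrative test in the same style (the function name and check lines below are invented for illustration, not taken from the patch):

```llvm
; RUN: opt -load-store-vectorizer %s -S -o - | FileCheck %s

target triple = "x86_64--"

; Two contiguous i32 loads from a noalias pointer should collapse into a
; single <2 x i32> load, mirroring what correct-order.ll and
; subchain-interleaved.ll check above.
; CHECK-LABEL: @adjacent_pair(
; CHECK: load <2 x i32>
; CHECK-NOT: load i32
define i32 @adjacent_pair(i32* noalias %ptr) {
  %p0 = getelementptr i32, i32* %ptr, i64 0
  %p1 = getelementptr i32, i32* %ptr, i64 1
  ; The loads are contiguous and provably non-aliasing, so the LSV can
  ; merge them and extract the two scalars from the vector result.
  %a = load i32, i32* %p0, align 8
  %b = load i32, i32* %p1, align 4
  %sum = add i32 %a, %b
  ret i32 %sum
}
```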

