path: root/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU
author    Farhana Aleen <farhana.aleen@gmail.com>  2018-07-19 16:50:27 +0000
committer Farhana Aleen <farhana.aleen@gmail.com>  2018-07-19 16:50:27 +0000
commit    8c7a30baea219e8143b13e3e384ff713d8bb7c76 (patch)
tree      6f6740fbacbb2020321e1c3833f2b13403986b17 /llvm/test/Transforms/LoadStoreVectorizer/AMDGPU
parent    d1cf276621a7382a0f8e1d6f70d317e3944ffbeb (diff)
[LoadStoreVectorizer] Use getMinusSCEV() to compute the distance between two pointers.
Summary:
Currently, isConsecutiveAccess() detects two pointers (PtrA and PtrB) as consecutive by comparing PtrB with BaseDelta+PtrA. This works when both pointers are factorized or when neither is factorized, but it fails when one pointer is factorized and the other is not. Here is an example:

  PtrA = 4 * (A + B)
  PtrB = 4 + 4A + 4B

This patch uses getMinusSCEV() to compute the distance between the two pointers instead. getMinusSCEV() combines the expressions and computes the simplified distance.

Author: FarhanaAleen

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D49516

llvm-svn: 337471
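For reference, here is a minimal sketch (not the actual patch; the helper name areConsecutiveBySCEV and its signature are hypothetical) of how a distance check built on ScalarEvolution::getMinusSCEV() can look:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"

using namespace llvm;

// Hypothetical helper: returns true if PtrB lies exactly Size bytes past PtrA.
// getMinusSCEV() folds and simplifies the subtraction, so subtracting a
// factorized expression such as 4*(A+B) from the expanded form 4 + 4A + 4B
// still reduces to the constant 4.
static bool areConsecutiveBySCEV(ScalarEvolution &SE, Value *PtrA, Value *PtrB,
                                 const APInt &Size) {
  const SCEV *SA = SE.getSCEV(PtrA);
  const SCEV *SB = SE.getSCEV(PtrB);
  const SCEV *Dist = SE.getMinusSCEV(SB, SA);
  // The accesses are consecutive only if the simplified distance is a
  // compile-time constant equal to the expected byte offset.
  if (const auto *C = dyn_cast<SCEVConstant>(Dist))
    return C->getAPInt() == Size;
  return false;
}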
Diffstat (limited to 'llvm/test/Transforms/LoadStoreVectorizer/AMDGPU')
-rw-r--r--  llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll  49
1 file changed, 49 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
new file mode 100644
index 00000000000..220efd21fe1
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
@@ -0,0 +1,49 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+declare i64 @_Z12get_local_idj(i32)
+
+declare i64 @_Z12get_group_idj(i32)
+
+declare double @llvm.fmuladd.f64(double, double, double)
+
+; CHECK-LABEL: @factorizedVsNonfactorizedAccess(
+; CHECK: load <2 x float>
+; CHECK: store <2 x float>
+define amdgpu_kernel void @factorizedVsNonfactorizedAccess(float addrspace(1)* nocapture %c) {
+entry:
+ %call = tail call i64 @_Z12get_local_idj(i32 0)
+ %call1 = tail call i64 @_Z12get_group_idj(i32 0)
+ %div = lshr i64 %call, 4
+ %div2 = lshr i64 %call1, 3
+ %mul = shl i64 %div2, 7
+ %rem = shl i64 %call, 3
+ %mul3 = and i64 %rem, 120
+ %add = or i64 %mul, %mul3
+ %rem4 = shl i64 %call1, 7
+ %mul5 = and i64 %rem4, 896
+ %mul6 = shl nuw nsw i64 %div, 3
+ %add7 = add nuw i64 %mul5, %mul6
+ %mul9 = shl i64 %add7, 10
+ %add10 = add i64 %mul9, %add
+ %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 %add10
+ %load1 = load float, float addrspace(1)* %arrayidx, align 4
+ %conv = fpext float %load1 to double
+ %mul11 = fmul double %conv, 0x3FEAB481D8F35506
+ %conv12 = fptrunc double %mul11 to float
+ %conv18 = fpext float %conv12 to double
+ %storeval1 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv18)
+ %cstoreval1 = fptrunc double %storeval1 to float
+ store float %cstoreval1, float addrspace(1)* %arrayidx, align 4
+
+ %add23 = or i64 %add10, 1
+ %arrayidx24 = getelementptr inbounds float, float addrspace(1)* %c, i64 %add23
+ %load2 = load float, float addrspace(1)* %arrayidx24, align 4
+ %conv25 = fpext float %load2 to double
+ %mul26 = fmul double %conv25, 0x3FEAB481D8F35506
+ %conv27 = fptrunc double %mul26 to float
+ %conv34 = fpext float %conv27 to double
+ %storeval2 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv34)
+ %cstoreval2 = fptrunc double %storeval2 to float
+ store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
+ ret void
+}
\ No newline at end of file