summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
Commit message (Collapse)AuthorAgeFilesLines
* [NFC] Fixes -Wrange-loop-analysis warningsMark de Wever2020-01-011-1/+1
| | | | | | This avoids new warnings due to D68912 adds -Wrange-loop-analysis to -Wall. Differential Revision: https://reviews.llvm.org/D71857
* Revert "[DependenceAnalysis] Dependecies for loads marked with ↵Benjamin Kramer2019-11-211-19/+6
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151" Summary: Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151" This reverts commit 5f026b6d9e882941fde9b7e5dc0a2d807f7f24f5. We're (tensorflow.org/xla team) seeing some misscompiles with the new change, only at -O3, with fast math disabled. I'm still trying to come up with a useful/small/external example, but for now, the following IR: ``` ; ModuleID = '__compute_module' source_filename = "__compute_module" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-grtev4-linux-gnu" @0 = private unnamed_addr constant [4 x i8] c"\DB\0F\C9@" @1 = private unnamed_addr constant [4 x i8] c"\00\00\00?" ; Function Attrs: uwtable define void @jit_wrapped_fun.31(i8* %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 { entry: %fusion.invar_address.dim.2 = alloca i64 %fusion.invar_address.dim.1 = alloca i64 %fusion.invar_address.dim.0 = alloca i64 %fusion.1.invar_address.dim.2 = alloca i64 %fusion.1.invar_address.dim.1 = alloca i64 %fusion.1.invar_address.dim.0 = alloca i64 %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1 %1 = load i8*, i8** %0, !invariant.load !0, !dereferenceable !1, !align !2 %parameter.3 = bitcast i8* %1 to [2 x [1 x [4 x float]]]* %2 = getelementptr inbounds i8*, i8** %buffer_table, i64 5 %3 = load i8*, i8** %2, !invariant.load !0, !dereferenceable !1, !align !2 %fusion.1 = bitcast i8* %3 to [2 x [1 x [4 x float]]]* store i64 0, i64* %fusion.1.invar_address.dim.0 br label %fusion.1.loop_header.dim.0 fusion.1.loop_header.dim.0: ; preds = %fusion.1.loop_exit.dim.1, %entry %fusion.1.indvar.dim.0 = load i64, i64* %fusion.1.invar_address.dim.0 %4 = icmp uge i64 %fusion.1.indvar.dim.0, 2 br i1 %4, label %fusion.1.loop_exit.dim.0, label %fusion.1.loop_body.dim.0 fusion.1.loop_body.dim.0: ; preds = %fusion.1.loop_header.dim.0 store i64 0, i64* %fusion.1.invar_address.dim.1 br label %fusion.1.loop_header.dim.1 fusion.1.loop_header.dim.1: ; preds = %fusion.1.loop_exit.dim.2, %fusion.1.loop_body.dim.0 %fusion.1.indvar.dim.1 = load i64, i64* %fusion.1.invar_address.dim.1 %5 = icmp uge i64 %fusion.1.indvar.dim.1, 1 br i1 %5, label %fusion.1.loop_exit.dim.1, label %fusion.1.loop_body.dim.1 fusion.1.loop_body.dim.1: ; preds = %fusion.1.loop_header.dim.1 store i64 0, i64* %fusion.1.invar_address.dim.2 br label %fusion.1.loop_header.dim.2 fusion.1.loop_header.dim.2: ; preds = %fusion.1.loop_body.dim.2, %fusion.1.loop_body.dim.1 %fusion.1.indvar.dim.2 = load i64, i64* %fusion.1.invar_address.dim.2 %6 = icmp uge i64 %fusion.1.indvar.dim.2, 4 br i1 %6, label %fusion.1.loop_exit.dim.2, label %fusion.1.loop_body.dim.2 fusion.1.loop_body.dim.2: ; preds = %fusion.1.loop_header.dim.2 %7 = load float, float* bitcast ([4 x i8]* @0 to float*) %8 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2 %9 = load float, float* %8, !invariant.load !0, !noalias !3 %10 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2 %11 = load float, float* %10, !invariant.load !0, !noalias !3 %12 = fmul float %9, %11 %13 = fmul float %7, %12 %14 = call float @llvm.log.f32(float %13) %15 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2 store float %14, float* %15, !alias.scope !7, !noalias !8 %invar.inc2 = add nuw nsw i64 %fusion.1.indvar.dim.2, 1 store i64 %invar.inc2, i64* %fusion.1.invar_address.dim.2 br label %fusion.1.loop_header.dim.2 fusion.1.loop_exit.dim.2: ; preds = %fusion.1.loop_header.dim.2 %invar.inc1 = add nuw nsw i64 %fusion.1.indvar.dim.1, 1 store i64 %invar.inc1, i64* %fusion.1.invar_address.dim.1 br label %fusion.1.loop_header.dim.1 fusion.1.loop_exit.dim.1: ; preds = %fusion.1.loop_header.dim.1 %invar.inc = add nuw nsw i64 %fusion.1.indvar.dim.0, 1 store i64 %invar.inc, i64* %fusion.1.invar_address.dim.0 br label %fusion.1.loop_header.dim.0 fusion.1.loop_exit.dim.0: ; preds = %fusion.1.loop_header.dim.0 %16 = getelementptr inbounds i8*, i8** %buffer_table, i64 4 %17 = load i8*, i8** %16, !invariant.load !0, !dereferenceable !9, !align !2 %parameter.1 = bitcast i8* %17 to float* %18 = getelementptr inbounds i8*, i8** %buffer_table, i64 2 %19 = load i8*, i8** %18, !invariant.load !0, !dereferenceable !10, !align !2 %parameter.2 = bitcast i8* %19 to [3 x [1 x float]]* %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 0 %21 = load i8*, i8** %20, !invariant.load !0, !dereferenceable !11, !align !2 %fusion = bitcast i8* %21 to [2 x [3 x [4 x float]]]* store i64 0, i64* %fusion.invar_address.dim.0 br label %fusion.loop_header.dim.0 fusion.loop_header.dim.0: ; preds = %fusion.loop_exit.dim.1, %fusion.1.loop_exit.dim.0 %fusion.indvar.dim.0 = load i64, i64* %fusion.invar_address.dim.0 %22 = icmp uge i64 %fusion.indvar.dim.0, 2 br i1 %22, label %fusion.loop_exit.dim.0, label %fusion.loop_body.dim.0 fusion.loop_body.dim.0: ; preds = %fusion.loop_header.dim.0 store i64 0, i64* %fusion.invar_address.dim.1 br label %fusion.loop_header.dim.1 fusion.loop_header.dim.1: ; preds = %fusion.loop_exit.dim.2, %fusion.loop_body.dim.0 %fusion.indvar.dim.1 = load i64, i64* %fusion.invar_address.dim.1 %23 = icmp uge i64 %fusion.indvar.dim.1, 3 br i1 %23, label %fusion.loop_exit.dim.1, label %fusion.loop_body.dim.1 fusion.loop_body.dim.1: ; preds = %fusion.loop_header.dim.1 store i64 0, i64* %fusion.invar_address.dim.2 br label %fusion.loop_header.dim.2 fusion.loop_header.dim.2: ; preds = %fusion.loop_body.dim.2, %fusion.loop_body.dim.1 %fusion.indvar.dim.2 = load i64, i64* %fusion.invar_address.dim.2 %24 = icmp uge i64 %fusion.indvar.dim.2, 4 br i1 %24, label %fusion.loop_exit.dim.2, label %fusion.loop_body.dim.2 fusion.loop_body.dim.2: ; preds = %fusion.loop_header.dim.2 %25 = mul nuw nsw i64 %fusion.indvar.dim.2, 1 %26 = add nuw nsw i64 0, %25 %27 = udiv i64 %26, 4 %28 = mul nuw nsw i64 %fusion.indvar.dim.0, 1 %29 = add nuw nsw i64 0, %28 %30 = udiv i64 %29, 2 %31 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %29, i64 0, i64 %26 %32 = load float, float* %31, !alias.scope !7, !noalias !8 %33 = mul nuw nsw i64 %fusion.indvar.dim.1, 1 %34 = add nuw nsw i64 0, %33 %35 = udiv i64 %34, 3 %36 = load float, float* %parameter.1, !invariant.load !0, !noalias !3 %37 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %parameter.2, i64 0, i64 %34, i64 0 %38 = load float, float* %37, !invariant.load !0, !noalias !3 %39 = fsub float %36, %38 %40 = fmul float %39, %39 %41 = mul nuw nsw i64 %fusion.indvar.dim.2, 1 %42 = add nuw nsw i64 0, %41 %43 = udiv i64 %42, 4 %44 = mul nuw nsw i64 %fusion.indvar.dim.0, 1 %45 = add nuw nsw i64 0, %44 %46 = udiv i64 %45, 2 %47 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42 %48 = load float, float* %47, !invariant.load !0, !noalias !3 %49 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42 %50 = load float, float* %49, !invariant.load !0, !noalias !3 %51 = fmul float %48, %50 %52 = fdiv float %40, %51 %53 = fadd float %32, %52 %54 = fneg float %53 %55 = load float, float* bitcast ([4 x i8]* @1 to float*) %56 = fmul float %54, %55 %57 = getelementptr inbounds [2 x [3 x [4 x float]]], [2 x [3 x [4 x float]]]* %fusion, i64 0, i64 %fusion.indvar.dim.0, i64 %fusion.indvar.dim.1, i64 %fusion.indvar.dim.2 store float %56, float* %57, !alias.scope !8, !noalias !12 %invar.inc5 = add nuw nsw i64 %fusion.indvar.dim.2, 1 store i64 %invar.inc5, i64* %fusion.invar_address.dim.2 br label %fusion.loop_header.dim.2 fusion.loop_exit.dim.2: ; preds = %fusion.loop_header.dim.2 %invar.inc4 = add nuw nsw i64 %fusion.indvar.dim.1, 1 store i64 %invar.inc4, i64* %fusion.invar_address.dim.1 br label %fusion.loop_header.dim.1 fusion.loop_exit.dim.1: ; preds = %fusion.loop_header.dim.1 %invar.inc3 = add nuw nsw i64 %fusion.indvar.dim.0, 1 store i64 %invar.inc3, i64* %fusion.invar_address.dim.0 br label %fusion.loop_header.dim.0 fusion.loop_exit.dim.0: ; preds = %fusion.loop_header.dim.0 %58 = getelementptr inbounds i8*, i8** %buffer_table, i64 3 %59 = load i8*, i8** %58, !invariant.load !0, !dereferenceable !2, !align !2 %tuple.30 = bitcast i8* %59 to [1 x i8*]* %60 = bitcast [2 x [3 x [4 x float]]]* %fusion to i8* %61 = getelementptr inbounds [1 x i8*], [1 x i8*]* %tuple.30, i64 0, i64 0 store i8* %60, i8** %61, !alias.scope !14, !noalias !8 ret void } ; Function Attrs: nounwind readnone speculatable willreturn declare float @llvm.log.f32(float) #1 attributes #0 = { uwtable "no-frame-pointer-elim"="false" } attributes #1 = { nounwind readnone speculatable willreturn } !0 = !{} !1 = !{i64 32} !2 = !{i64 8} !3 = !{!4, !6} !4 = !{!"buffer: {index:0, offset:0, size:96}", !5} !5 = !{!"XLA global AA domain"} !6 = !{!"buffer: {index:5, offset:0, size:32}", !5} !7 = !{!6} !8 = !{!4} !9 = !{i64 4} !10 = !{i64 12} !11 = !{i64 96} !12 = !{!13, !6} !13 = !{!"buffer: {index:3, offset:0, size:8}", !5} !14 = !{!13} ``` gets (correctly) optimized to the one below without the change: ``` ; ModuleID = '__compute_module' source_filename = "__compute_module" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-grtev4-linux-gnu" ; Function Attrs: nofree nounwind uwtable define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 { entry: %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1 %1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]** %2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2 %3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5 %4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]** %5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2 %6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>* %7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3 %8 = fmul <4 x float> %7, %7 %9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000> %10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9) %11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>* store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8 %12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0 %13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0 %14 = bitcast float* %12 to <4 x float>* %15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3 %16 = fmul <4 x float> %15, %15 %17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000> %18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17) %19 = bitcast float* %13 to <4 x float>* store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8 %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4 %21 = bitcast i8** %20 to float** %22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2 %23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2 %24 = bitcast i8** %23 to [3 x [1 x float]]** %25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2 %26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2 %27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3 %.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0 %.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3 %28 = bitcast [3 x [1 x float]]* %25 to <2 x float>* %29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3 %30 = insertelement <2 x float> undef, float %27, i32 0 %31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer %32 = fsub <2 x float> %31, %29 %33 = fmul <2 x float> %32, %32 %shuffle30 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> %34 = fsub float %27, %.pre29 %35 = fmul float %34, %34 %36 = insertelement <4 x float> undef, float %35, i32 0 %37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer %shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> %38 = fmul <4 x float> %7, %7 %shuffle31 = shufflevector <4 x float> %38, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> %39 = fdiv <8 x float> %shuffle30, %shuffle31 %40 = fadd <8 x float> %shuffle, %39 %41 = fmul <8 x float> %40, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> %42 = bitcast i8* %26 to <8 x float>* store <8 x float> %41, <8 x float>* %42, align 8, !alias.scope !8, !noalias !12 %43 = getelementptr inbounds i8, i8* %26, i64 32 %44 = fdiv <4 x float> %37, %38 %45 = fadd <4 x float> %10, %44 %46 = fmul <4 x float> %45, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> %47 = bitcast i8* %43 to <4 x float>* store <4 x float> %46, <4 x float>* %47, align 8, !alias.scope !8, !noalias !12 %.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0 %.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0 %48 = bitcast float* %.phi.trans.insert to <4 x float>* %49 = load <4 x float>, <4 x float>* %48, align 8, !alias.scope !7, !noalias !8 %50 = bitcast float* %.phi.trans.insert12 to <4 x float>* %51 = load <4 x float>, <4 x float>* %50, align 8, !invariant.load !0, !noalias !3 %shuffle.1 = shufflevector <4 x float> %49, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> %52 = getelementptr inbounds i8, i8* %26, i64 48 %53 = fmul <4 x float> %51, %51 %shuffle31.1 = shufflevector <4 x float> %53, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> %54 = fdiv <8 x float> %shuffle30, %shuffle31.1 %55 = fadd <8 x float> %shuffle.1, %54 %56 = fmul <8 x float> %55, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> %57 = bitcast i8* %52 to <8 x float>* store <8 x float> %56, <8 x float>* %57, align 8, !alias.scope !8, !noalias !12 %58 = getelementptr inbounds i8, i8* %26, i64 80 %59 = fdiv <4 x float> %37, %53 %60 = fadd <4 x float> %49, %59 %61 = fmul <4 x float> %60, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> %62 = bitcast i8* %58 to <4 x float>* store <4 x float> %61, <4 x float>* %62, align 8, !alias.scope !8, !noalias !12 %63 = getelementptr inbounds i8*, i8** %buffer_table, i64 3 %64 = bitcast i8** %63 to [1 x i8*]** %65 = load [1 x i8*]*, [1 x i8*]** %64, align 8, !invariant.load !0, !dereferenceable !2, !align !2 %66 = getelementptr inbounds [1 x i8*], [1 x i8*]* %65, i64 0, i64 0 store i8* %26, i8** %66, align 8, !alias.scope !14, !noalias !8 ret void } ; Function Attrs: nounwind readnone speculatable willreturn declare <4 x float> @llvm.log.v4f32(<4 x float>) #1 attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" } attributes #1 = { nounwind readnone speculatable willreturn } !0 = !{} !1 = !{i64 32} !2 = !{i64 8} !3 = !{!4, !6} !4 = !{!"buffer: {index:0, offset:0, size:96}", !5} !5 = !{!"XLA global AA domain"} !6 = !{!"buffer: {index:5, offset:0, size:32}", !5} !7 = !{!6} !8 = !{!4} !9 = !{i64 4} !10 = !{i64 12} !11 = !{i64 96} !12 = !{!13, !6} !13 = !{!"buffer: {index:3, offset:0, size:8}", !5} !14 = !{!13} ``` and (incorrectly) optimized to the one below with the change: ``` ; ModuleID = '__compute_module' source_filename = "__compute_module" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-grtev4-linux-gnu" ; Function Attrs: nofree nounwind uwtable define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 { entry: %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1 %1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]** %2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2 %3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5 %4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]** %5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2 %6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>* %7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3 %8 = fmul <4 x float> %7, %7 %9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000> %10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9) %11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>* store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8 %12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0 %13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0 %14 = bitcast float* %12 to <4 x float>* %15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3 %16 = fmul <4 x float> %15, %15 %17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000> %18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17) %19 = bitcast float* %13 to <4 x float>* store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8 %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4 %21 = bitcast i8** %20 to float** %22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2 %23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2 %24 = bitcast i8** %23 to [3 x [1 x float]]** %25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2 %26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2 %27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3 %.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0 %.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3 %28 = bitcast [3 x [1 x float]]* %25 to <2 x float>* %29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3 %30 = insertelement <2 x float> undef, float %27, i32 0 %31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer %32 = fsub <2 x float> %31, %29 %33 = fmul <2 x float> %32, %32 %shuffle32 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> %34 = fsub float %27, %.pre29 %35 = fmul float %34, %34 %36 = insertelement <4 x float> undef, float %35, i32 0 %37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer %shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> %38 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 0, i64 0, i64 3 %39 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 0, i64 0, i64 3 %40 = fmul <4 x float> %7, %7 %41 = shufflevector <4 x float> %40, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> %42 = fdiv <8 x float> %shuffle32, %41 %43 = fadd <8 x float> %shuffle, %42 %44 = fmul <8 x float> %43, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> %45 = bitcast i8* %26 to <8 x float>* store <8 x float> %44, <8 x float>* %45, align 8, !alias.scope !8, !noalias !12 %46 = extractelement <4 x float> %10, i32 0 %47 = getelementptr inbounds i8, i8* %26, i64 32 %48 = extractelement <4 x float> %10, i32 1 %49 = extractelement <4 x float> %10, i32 2 %50 = load float, float* %38, align 4, !alias.scope !7, !noalias !8 %51 = load float, float* %39, align 4, !invariant.load !0, !noalias !3 %52 = fmul float %51, %51 %53 = insertelement <4 x float> undef, float %52, i32 3 %54 = fdiv <4 x float> %37, %53 %55 = insertelement <4 x float> undef, float %46, i32 0 %56 = insertelement <4 x float> %55, float %48, i32 1 %57 = insertelement <4 x float> %56, float %49, i32 2 %58 = insertelement <4 x float> %57, float %50, i32 3 %59 = fadd <4 x float> %58, %54 %60 = fmul <4 x float> %59, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> %61 = bitcast i8* %47 to <4 x float>* store <4 x float> %60, <4 x float>* %61, align 8, !alias.scope !8, !noalias !12 %.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0 %.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0 %62 = bitcast float* %.phi.trans.insert to <4 x float>* %63 = load <4 x float>, <4 x float>* %62, align 8, !alias.scope !7, !noalias !8 %64 = bitcast float* %.phi.trans.insert12 to <4 x float>* %65 = load <4 x float>, <4 x float>* %64, align 8, !invariant.load !0, !noalias !3 %shuffle.1 = shufflevector <4 x float> %63, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> %66 = getelementptr inbounds i8, i8* %26, i64 48 %67 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 3 %68 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 3 %69 = fmul <4 x float> %65, %65 %70 = shufflevector <4 x float> %69, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> %71 = fdiv <8 x float> %shuffle32, %70 %72 = fadd <8 x float> %shuffle.1, %71 %73 = fmul <8 x float> %72, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> %74 = bitcast i8* %66 to <8 x float>* store <8 x float> %73, <8 x float>* %74, align 8, !alias.scope !8, !noalias !12 %75 = extractelement <4 x float> %69, i32 0 %76 = extractelement <4 x float> %63, i32 0 %77 = getelementptr inbounds i8, i8* %26, i64 80 %78 = extractelement <4 x float> %69, i32 1 %79 = extractelement <4 x float> %63, i32 1 %80 = extractelement <4 x float> %69, i32 2 %81 = extractelement <4 x float> %63, i32 2 %82 = load float, float* %67, align 4, !alias.scope !7, !noalias !8 %83 = load float, float* %68, align 4, !invariant.load !0, !noalias !3 %84 = fmul float %83, %83 %85 = insertelement <4 x float> undef, float %75, i32 0 %86 = insertelement <4 x float> %85, float %78, i32 1 %87 = insertelement <4 x float> %86, float %80, i32 2 %88 = insertelement <4 x float> %87, float %84, i32 3 %89 = fdiv <4 x float> %37, %88 %90 = insertelement <4 x float> undef, float %76, i32 0 %91 = insertelement <4 x float> %90, float %79, i32 1 %92 = insertelement <4 x float> %91, float %81, i32 2 %93 = insertelement <4 x float> %92, float %82, i32 3 %94 = fadd <4 x float> %93, %89 %95 = fmul <4 x float> %94, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> %96 = bitcast i8* %77 to <4 x float>* store <4 x float> %95, <4 x float>* %96, align 8, !alias.scope !8, !noalias !12 %97 = getelementptr inbounds i8*, i8** %buffer_table, i64 3 %98 = bitcast i8** %97 to [1 x i8*]** %99 = load [1 x i8*]*, [1 x i8*]** %98, align 8, !invariant.load !0, !dereferenceable !2, !align !2 %100 = getelementptr inbounds [1 x i8*], [1 x i8*]* %99, i64 0, i64 0 store i8* %26, i8** %100, align 8, !alias.scope !14, !noalias !8 ret void } ; Function Attrs: nounwind readnone speculatable willreturn declare <4 x float> @llvm.log.v4f32(<4 x float>) #1 attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" } attributes #1 = { nounwind readnone speculatable willreturn } !0 = !{} !1 = !{i64 32} !2 = !{i64 8} !3 = !{!4, !6} !4 = !{!"buffer: {index:0, offset:0, size:96}", !5} !5 = !{!"XLA global AA domain"} !6 = !{!"buffer: {index:5, offset:0, size:32}", !5} !7 = !{!6} !8 = !{!4} !9 = !{i64 4} !10 = !{i64 12} !11 = !{i64 96} !12 = !{!13, !6} !13 = !{!"buffer: {index:3, offset:0, size:8}", !5} !14 = !{!13} ``` This results in bad numerical answers when used through XLA. Again, it's not that easy to give a small fully-reproducible example, but the misscompare is: ``` Expected literal: ( f32[2,3,4] { { { nan, -inf, -3181.35, -inf }, { nan, -inf, -28.2577019, -inf }, { nan, -inf, -28.2577019, -inf } }, { { -inf, -inf, -inf, -inf }, { -6.60753046e+28, -1.47314833e+23, -inf, -inf }, { -2.43504347e+30, -5.42892693e+24, -inf, -inf } } } ) Actual literal: ( f32[2,3,4] { { { nan, -inf, -3181.35, -inf }, { nan, -inf, -inf, -inf }, { inf, -inf, -28.2577019, -inf } }, { { -inf, -inf, -inf, -inf }, { -6.60753046e+28, -1.47314833e+23, -inf, -inf }, { -2.43504347e+30, -5.42892693e+24, -inf, -inf } } } ) ``` Reviewers: sanjoy.google, sanjoy, ebrevnov, jdoerfert, reames, chandlerc Subscribers: hiraditya, Charusso, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D70516
* [DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" ↵Evgeniy Brevnov2019-11-191-6/+19
| | | | | | | | | | | | | | | | | | | should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151 Summary: Dependence anlysis has a mechanism to cache results. Thus for particular memory access the cache keep track of side effects in basic blocks. The problem is that for invariant loads dependepce analysis legally ignores many dependencies due to a special semantic rules for such loads. But later results calculated for invariant load retrived from the cache for general case acceses. As a result we have wrong dependence information causing GVN to do illegal transformation. Fixes, T42151. Proposed solution is to disable caching of invariant loads. I think such loads a pretty rare and it doesn't make sense to extend caching mechanism for them. Reviewers: reames, chandlerc, skatkov, morisset, jdoerfert Reviewed By: reames Subscribers: hiraditya, test, jdoerfert, lebedev.ri, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D64405
* Sink all InitializePasses.h includesReid Kleckner2019-11-131-0/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This file lists every pass in LLVM, and is included by Pass.h, which is very popular. Every time we add, remove, or rename a pass in LLVM, it caused lots of recompilation. I found this fact by looking at this table, which is sorted by the number of times a file was changed over the last 100,000 git commits multiplied by the number of object files that depend on it in the current checkout: recompiles touches affected_files header 342380 95 3604 llvm/include/llvm/ADT/STLExtras.h 314730 234 1345 llvm/include/llvm/InitializePasses.h 307036 118 2602 llvm/include/llvm/ADT/APInt.h 213049 59 3611 llvm/include/llvm/Support/MathExtras.h 170422 47 3626 llvm/include/llvm/Support/Compiler.h 162225 45 3605 llvm/include/llvm/ADT/Optional.h 158319 63 2513 llvm/include/llvm/ADT/Triple.h 140322 39 3598 llvm/include/llvm/ADT/StringRef.h 137647 59 2333 llvm/include/llvm/Support/Error.h 131619 73 1803 llvm/include/llvm/Support/FileSystem.h Before this change, touching InitializePasses.h would cause 1345 files to recompile. After this change, touching it only causes 550 compiles in an incremental rebuild. Reviewers: bkramer, asbirlea, bollu, jdoerfert Differential Revision: https://reviews.llvm.org/D70211
* Change TargetLibraryInfo analysis passes to always require FunctionTeresa Johnson2019-09-071-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Summary: This is the first change to enable the TLI to be built per-function so that -fno-builtin* handling can be migrated to use function attributes. See discussion on D61634 for background. This is an enabler for fixing handling of these options for LTO, for example. This change should not affect behavior, as the provided function is not yet used to build a specifically per-function TLI, but rather enables that migration. Most of the changes were very mechanical, e.g. passing a Function to the legacy analysis pass's getTLI interface, or in Module level cases, adding a callback. This is similar to the way the per-function TTI analysis works. There was one place where we were looking for builtins but not in the context of a specific function. See FindCXAAtExit in lib/Transforms/IPO/GlobalOpt.cpp. I'm somewhat concerned my workaround could provide the wrong behavior in some corner cases. Suggestions welcome. Reviewers: chandlerc, hfinkel Subscribers: arsenm, dschuff, jvesely, nhaehnle, mehdi_amini, javed.absar, sbc100, jgravelle-google, eraman, aheejin, steven_wu, george.burgess.iv, dexonsmith, jfb, asbirlea, gchatelet, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D66428 llvm-svn: 371284
* [Instruction] Add hasMetadata(Kind) helper [NFC]Philip Reames2019-09-041-3/+3
| | | | | | It's a common idiom, so let's add the obvious wrapper for metadata kinds which are basically booleans. llvm-svn: 370933
* [MemDep] allow to select block-scan-limit when constructing ↵Fedor Sergeev2019-08-101-5/+8
| | | | | | | | | | | | | MemoryDependenceAnalysis Introducing non-global control for default block-scan-limit in MemDep analysis. Useful when there are many compilations per initialized LLVM instance (e.g. JIT). Reviewed By: asbirlea Tags: #llvm Differential Revision: https://reviews.llvm.org/D65806 llvm-svn: 368502
* [MemDepAnalysis] Allow caller to pass in an OrderedBasicBlock.Florian Hahn2019-03-281-14/+19
| | | | | | | | | | | | | If the caller can preserve the OBB, we can avoid recomputing the order for each getDependency call. Reviewers: efriedma, rnk, hfinkel Reviewed By: rnk Differential Revision: https://reviews.llvm.org/D59788 llvm-svn: 357206
* [NFC] Fix typos: preceeding -> precedingJordan Rupprecht2019-02-231-1/+1
| | | | llvm-svn: 354715
* Update the file headers across all of the LLVM projects in the monorepoChandler Carruth2019-01-191-4/+3
| | | | | | | | | | | | | | | | | to reflect the new license. We understand that people may be surprised that we're moving the header entirely to discuss the new license. We checked this carefully with the Foundation's lawyer and we believe this is the correct approach. Essentially, all code in the project is now made available by the LLVM project under our new license, so you will see that the license headers include that license only. Some of our contributors have contributed code under our old license, and accordingly, we have retained a copy of our old license notice in the top-level files in each project and repository. llvm-svn: 351636
* [CallSite removal] Migrate all Alias Analysis APIs to use the newlyChandler Carruth2019-01-071-25/+22
| | | | | | | | | | | | | | | | | | | | | | | | | | minted `CallBase` class instead of the `CallSite` wrapper. This moves the largest interwoven collection of APIs that traffic in `CallSite`s. While a handful of these could have been migrated with a minorly more shallow migration by converting from a `CallSite` to a `CallBase`, it hardly seemed worth it. Most of the APIs needed to migrate together because of the complex interplay of AA APIs and the fact that converting from a `CallBase` to a `CallSite` isn't free in its current implementation. Out of tree users of these APIs can fairly reliably migrate with some combination of `.getInstruction()` on the `CallSite` instance and casting the resulting pointer. The most generic form will look like `CS` -> `cast_or_null<CallBase>(CS.getInstruction())` but in most cases there is a more elegant migration. Hopefully, this migrates enough APIs for users to fully move from `CallSite` to the base class. All of the in-tree users were easily migrated in that fashion. Thanks for the review from Saleem! Differential Revision: https://reviews.llvm.org/D55641 llvm-svn: 350503
* Revert r348645 - "[MemCpyOpt] memset->memcpy forwarding with undef tail"David L. Jones2018-12-131-6/+0
| | | | | | | This revision caused trucated memsets for structs with padding. See: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20181210/610520.html llvm-svn: 349002
* [MemCpyOpt] memset->memcpy forwarding with undef tailNikita Popov2018-12-071-0/+6
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Currently memcpyopt optimizes cases like memset(a, byte, N); memcpy(b, a, M); to memset(a, byte, N); memset(b, byte, M); if M <= N. Often this allows further simplifications down the line, which drop the first memset entirely. This patch extends this optimization for the case where M > N, but we know that the bytes a[N..M] are undef due to alloca/lifetime.start. This situation arises relatively often for Rust code, because Rust does not initialize trailing structure padding and loves to insert redundant memcpys. This also fixes https://bugs.llvm.org/show_bug.cgi?id=39844. For the implementation, I'm reusing a bit of code for a similar existing optimization (direct memcpy of undef). I've also added memset support to MemDepAnalysis GetLocation -- Instead, getPointerDependencyFrom could be used, but it seems to make more sense to add this to GetLocation and thus make the computation cachable. Differential Revision: https://reviews.llvm.org/D55120 llvm-svn: 348645
* [Analysis] Make LocationSizes carry an 'imprecise' bitGeorge Burgess IV2018-10-101-15/+30
| | | | | | | | | | | | | | | | | | | | | | There are places where we need to merge multiple LocationSizes of different sizes into one, and get a sensible result. There are other places where we want to optimize aggressively based on the value of a LocationSizes (e.g. how can a store of four bytes be to an area of storage that's only two bytes large?) This patch makes LocationSize hold an 'imprecise' bit to note whether the LocationSize can be treated as an upper-bound and lower-bound for the size of a location, or just an upper-bound. This concludes the series of patches leading up to this. The most recent of which is r344108. Fixes PR36228. Differential Revision: https://reviews.llvm.org/D44748 llvm-svn: 344114
* llvm::sort(C.begin(), C.end(), ...) -> llvm::sort(C, ...)Fangrui Song2018-09-271-3/+3
| | | | | | | | | | | | Summary: The convenience wrapper in STLExtras is available since rL342102. Reviewers: dblaikie, javed.absar, JDevlieghere, andreadb Subscribers: MatzeB, sanjoy, arsenm, dschuff, mehdi_amini, sdardis, nemanjai, jvesely, nhaehnle, sbc100, jgravelle-google, eraman, aheejin, kbarton, JDevlieghere, javed.absar, gbedwell, jrtc27, mgrang, atanasyan, steven_wu, george.burgess.iv, dexonsmith, kristina, jsji, llvm-commits Differential Revision: https://reviews.llvm.org/D52573 llvm-svn: 343163
* [IR] Replace `isa<TerminatorInst>` with `isTerminator()`.Chandler Carruth2018-08-261-1/+1
| | | | | | | | | | | | This is a bit awkward in a handful of places where we didn't even have an instruction and now we have to see if we can build one. But on the whole, this seems like a win and at worst a reasonable cost for removing `TerminatorInst`. All of this is part of the removal of `TerminatorInst` from the `Instruction` type hierarchy. llvm-svn: 340701
* [MemDep] Use PhiValuesAnalysis to improve alias analysis resultsJohn Brawn2018-07-311-3/+14
| | | | | | | | | | This is being done in order to make GVN able to better optimize certain inputs. MemDep doesn't use PhiValues directly, but does need to notifiy it when things get invalidated. Differential Revision: https://reviews.llvm.org/D48489 llvm-svn: 338384
* Dissallow non-empty metadata for invariant.groupPiotr Padlewski2018-05-181-3/+3
| | | | | | | | | | | | | | | Summary: This feature is not needed, but it might be usefull in the future to use metadata to mark what which function should support it (and strip it when not). Reviewers: rsmith, sanjoy, amharc, kuhar Subscribers: hiraditya, llvm-commits Differential Revision: https://reviews.llvm.org/D45419 llvm-svn: 332787
* [MemDep] Fixed handling of invariant.groupPiotr Padlewski2018-05-181-5/+26
| | | | | | | | | | | | | | | | | | Summary: Memdep had funny bug related to invariant.groups - because it did not invalidated cache, in some very rare cases it was possible to show memory dependence of the instruction that was deleted, but because other instruction took it's place it resulted in call to vtable! Thanks @amharc for repro!. Reviewers: dberlin, kuhar, amharc Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D45320 Co-authored-by: Krzysztof Pszeniczny <krzysztof.pszeniczny@gmail.com> llvm-svn: 332781
* Rename DEBUG macro to LLVM_DEBUG.Nicola Zaghen2018-05-141-6/+6
| | | | | | | | | | | | | | | | The DEBUG() macro is very generic so it might clash with other projects. The renaming was done as follows: - git grep -l 'DEBUG' | xargs sed -i 's/\bDEBUG\s\?(/LLVM_DEBUG(/g' - git diff -U0 master | ../clang/tools/clang-format/clang-format-diff.py -i -p1 -style LLVM - Manual change to APInt - Manually chage DOCS as regex doesn't match it. In the transition period the DEBUG() macro is still present and aliased to the LLVM_DEBUG() one. Differential Revision: https://reviews.llvm.org/D43624 llvm-svn: 332240
* [Analysis] Change std::sort to llvm::sort in response to r327219Mandeep Singh Grang2018-04-011-3/+3
| | | | | | | | | | | | | | | | | | | | | | Summary: r327219 added wrappers to std::sort which randomly shuffle the container before sorting. This will help in uncovering non-determinism caused due to undefined sorting order of objects having the same key. To make use of that infrastructure we need to invoke llvm::sort instead of std::sort. Note: This patch is one of a series of patches to replace *all* std::sort to llvm::sort. Refer D44363 for a list of all the required patches. Reviewers: sanjoy, dexonsmith, hfinkel, RKSimon Reviewed By: dexonsmith Subscribers: david2050, llvm-commits Differential Revision: https://reviews.llvm.org/D44944 llvm-svn: 328925
* [MDA] Use common code instead of reimplementing same. [NFC]Philip Reames2018-01-171-10/+2
| | | | llvm-svn: 322747
* Revert "[memcpyopt] Teach memcpyopt to optimize across basic blocks"Reid Kleckner2017-12-281-11/+9
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This reverts r321138. It seems there are still underlying issues with memdep. PR35519 seems to still be present if debug info is enabled. We end up losing a memcpy. Somehow during store to memset merging, we insert the memset after the memcpy or fail to update the memdep analysis to account for the newly inserted memset of a pair. Reduced test case: #include <assert.h> #include <stdio.h> #include <string> #include <utility> #include <vector> void do_push_back( std::vector<std::pair<std::string, std::vector<std::string>>>* crls) { crls->push_back(std::make_pair(std::string(), std::vector<std::string>())); } int __attribute__((optnone)) main() { // Put some data in the vector and then remove it so we take the push_back // fast path. std::vector<std::pair<std::string, std::vector<std::string>>> crl_set; crl_set.push_back({"asdf", {}}); crl_set.pop_back(); printf("first word in vector storage: %p\n", *(void**)crl_set.data()); // Do the push_back which may fail to initialize the data. do_push_back(&crl_set); auto* first = &crl_set.back().first; printf("first word in vector storage (should be zero): %p\n", *(void**)crl_set.data()); assert(first->empty()); puts("ok"); } Compile with libc++, enable optimizations, and enable debug info: $ clang++ -stdlib=libc++ -g -O2 t.cpp -o t.exe -Wl,-rpath=llvm/build/lib This program will assert with this change. llvm-svn: 321510
* [ModRefInfo] Add must alias info to ModRefInfo.Alina Sbirlea2017-12-211-1/+2
| | | | | | | | | | | | | | | | | | | | | | Summary: Add an additional bit to ModRefInfo, ModRefInfo::Must, to be cleared for known must aliases. Shift existing Mod/Ref/ModRef values to include an additional most significant bit. Update wrappers that modify ModRefInfo values to reflect the change. Notes: * ModRefInfo::Must is almost entirely cleared in the AAResults methods, the remaining changes are trying to preserve it. * Only some small changes to make custom AA passes set ModRefInfo::Must (BasicAA). * GlobalsModRef already declares a bit, who's meaning overlaps with the most significant bit in ModRefInfo (MayReadAnyGlobal). No changes to shift the value of MayReadAnyGlobal (see AlignedMap). FunctionInfo.getModRef() ajusts most significant bit so correctness is preserved, but the Must info is lost. * There are cases where the ModRefInfo::Must is not set, e.g. 2 calls that only read will return ModRefInfo::NoModRef, though they may read from exactly the same location. Reviewers: dberlin, hfinkel, george.burgess.iv Subscribers: llvm-commits, sanjoy Differential Revision: https://reviews.llvm.org/D38862 llvm-svn: 321309
* [memcpyopt] Teach memcpyopt to optimize across basic blocksDan Gohman2017-12-201-9/+11
| | | | | | | | | | | | | | | | | This teaches memcpyopt to make a non-local memdep query when a local query indicates that the dependency is non-local. This notably allows it to eliminate many more llvm.memcpy calls in common Rust code, often by 20-30%. This is r319482 and r319483, along with fixes for PR35519: fix the optimization that merges stores into memsets to preserve cached memdep info, and fix memdep's non-local caching strategy to not assume that larger queries are always more conservative than smaller ones. Fixes PR28958 and PR35519. Differential Revision: https://reviews.llvm.org/D40802 llvm-svn: 321138
* Hardware-assisted AddressSanitizer (llvm part).Evgeniy Stepanov2017-12-091-2/+4
| | | | | | | | | | | | | | | | | | | | | Summary: This is LLVM instrumentation for the new HWASan tool. It is basically a stripped down copy of ASan at this point, w/o stack or global support. Instrumenation adds a global constructor + runtime callbacks for every load and store. HWASan comes with its own IR attribute. A brief design document can be found in clang/docs/HardwareAssistedAddressSanitizerDesign.rst (submitted earlier). Reviewers: kcc, pcc, alekseyshl Subscribers: srhines, mehdi_amini, mgorny, javed.absar, eraman, llvm-commits, hiraditya Differential Revision: https://reviews.llvm.org/D40932 llvm-svn: 320217
* [ModRefInfo] Make enum ModRefInfo an enum class [NFC].Alina Sbirlea2017-12-071-16/+16
| | | | | | | | | | | | | | | Summary: Make enum ModRefInfo an enum class. Changes to ModRefInfo values should be done using inline wrappers. This should prevent future bit-wise opearations from being added, which can be more error-prone. Reviewers: sanjoy, dberlin, hfinkel, george.burgess.iv Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D40933 llvm-svn: 320107
* [ModRefInfo] Replace remaining bit-wise operations with wrappers.Alina Sbirlea2017-12-071-1/+1
| | | | llvm-svn: 319993
* Revert r319482 and r319483 "[memcpyopt] Teach memcpyopt to optimize across ↵Hans Wennborg2017-12-061-8/+0
| | | | | | | | | | | | | | | | | | | | | | | | basic blocks" This caused PR35519. > [memcpyopt] Teach memcpyopt to optimize across basic blocks > > This teaches memcpyopt to make a non-local memdep query when a local query > indicates that the dependency is non-local. This notably allows it to > eliminate many more llvm.memcpy calls in common Rust code, often by 20-30%. > > Fixes PR28958. > > Differential Revision: https://reviews.llvm.org/D38374 > > [memcpyopt] Commit file missed in r319482. > > This change was meant to be included with r319482 but was accidentally > omitted. llvm-svn: 319873
* Modify ModRefInfo values using static inline method abstractions [NFC].Alina Sbirlea2017-12-051-9/+7
| | | | | | | | | | | | | | | | | Summary: The aim is to make ModRefInfo checks and changes more intuitive and less error prone using inline methods that abstract the bit operations. Ideally ModRefInfo would become an enum class, but that change will require a wider set of changes into FunctionModRefBehavior. Reviewers: sanjoy, george.burgess.iv, dberlin, hfinkel Subscribers: nlopes, llvm-commits Differential Revision: https://reviews.llvm.org/D40749 llvm-svn: 319821
* [memcpyopt] Teach memcpyopt to optimize across basic blocksDan Gohman2017-11-301-0/+8
| | | | | | | | | | | | This teaches memcpyopt to make a non-local memdep query when a local query indicates that the dependency is non-local. This notably allows it to eliminate many more llvm.memcpy calls in common Rust code, often by 20-30%. Fixes PR28958. Differential Revision: https://reviews.llvm.org/D38374 llvm-svn: 319482
* [MemDep] DBG intrinsics don't impact abort limit for call site dependence ↵Mikael Holmen2017-10-251-5/+5
| | | | | | | | | | | | | | | | | | | | analysis Summary: Memory dependence analysis no longer counts DbgInfoIntrinsics towards the limit where to abort the analysis. Before, a bunch of calls to dbg.value could affect the generated code, meaning that with -g we could generate different code than without. Reviewers: chandlerc, Prazek, davide, efriedma Reviewed By: efriedma Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D39181 llvm-svn: 316551
* [Analysis] Fix some Clang-tidy modernize and Include What You Use warnings; ↵Eugene Zelenko2017-08-161-8/+16
| | | | | | other minor fixes (NFC). llvm-svn: 311048
* [MemDep] Cleanup return after else & use `auto`. NFC.Davide Italiano2017-06-251-3/+3
| | | | llvm-svn: 306255
* Sort the remaining #include lines in include/... and lib/....Chandler Carruth2017-06-061-3/+3
| | | | | | | | | | | | | | | | | | | | | | | | | I did this a long time ago with a janky python script, but now clang-format has built-in support for this. I fed clang-format every line with a #include and let it re-sort things according to the precise LLVM rules for include ordering baked into clang-format these days. I've reverted a number of files where the results of sorting includes isn't healthy. Either places where we have legacy code relying on particular include ordering (where possible, I'll fix these separately) or where we have particular formatting around #include lines that I didn't want to disturb in this patch. This patch is *entirely* mechanical. If you get merge conflicts or anything, just ignore the changes in this patch and run clang-format over your #include lines in the files. Sorry for any noise here, but it is important to keep these things stable. I was seeing an increasing number of patches with irrelevant re-ordering of #include lines because clang-format was used. This patch at least isolates that churn, makes it easy to skip when resolving conflicts, and gets us to a clean baseline (again). llvm-svn: 304787
* Added LLVM_FALLTHROUGH to address warning: this statement may fall through. NFC.Galina Kistanova2017-05-311-0/+1
| | | | llvm-svn: 304356
* [Devirtualization] MemDep returns non-local !invariant.group dependenciesPiotr Padlewski2017-01-121-8/+55
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Summary: Memory Dependence Analysis was limited to return only local dependencies for invariant.group handling. Now it returns NonLocal when it finds it and then by asking getNonLocalPointerDependency we get found dep. Thanks to this we are able to devirtualize loops! void indirect(A &a, int n) { for (int i = 0 ; i < n; i++) a.foo(); } void test(int n) { A a; indirect(a); } After inlining a.foo() will be changed to direct call, even if foo and A::A() is external (but only if vtable definition is be available). Reviewers: nlewycky, dberlin, chandlerc, rsmith Subscribers: mehdi_amini, davide, llvm-commits Differential Revision: https://reviews.llvm.org/D28137 llvm-svn: 291762
* [MemDep] NFC variable name changePiotr Padlewski2017-01-111-3/+3
| | | | llvm-svn: 291679
* [MemDep] NFC walk invariant.group graph only downPiotr Padlewski2017-01-081-26/+16
| | | | | | | | | | | | | | Summary: By using stripPointerCasts we can get to the root value and then walk down the bitcast graph Reviewers: reames Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D28181 llvm-svn: 291405
* [MemDep] Handle gep with zeros for invariant.groupPiotr Padlewski2016-12-301-20/+39
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Summary: gep 0, 0 is equivalent to bitcast. LLVM canonicalizes it to getelementptr because it make SROA can then handle it. Simple case like void g(A &a) { z(a); if (glob) a.foo(); } void testG() { A a; g(a); } was not devirtualized with -fstrict-vtable-pointers because luck of handling for gep 0 in Memory Dependence Analysis Reviewers: dberlin, nlewycky, chandlerc Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D28126 llvm-svn: 290763
* [PM] Teach MemDep to invalidate its result object when its cachedChandler Carruth2016-12-271-0/+18
| | | | | | | | analysis handles become invalid. Add a test case for its invalidation logic. llvm-svn: 290620
* [MemDep] Operand visited twice bugfixPiotr Padlewski2016-12-271-0/+1
| | | | | | | | Because operand was not marked as seen it was visited twice. It doesn't change behavior of optimization, it just saves redudant visit, so no test changes. llvm-svn: 290607
* [MemDep] NFC changesPiotr Padlewski2016-12-231-2/+1
| | | | llvm-svn: 290428
* Revert @llvm.assume with operator bundles (r289755-r289757)Daniel Jasper2016-12-191-3/+8
| | | | | | | This creates non-linear behavior in the inliner (see more details in r289755's commit thread). llvm-svn: 290086
* Remove the AssumptionCacheHal Finkel2016-12-151-8/+3
| | | | | | | | | After r289755, the AssumptionCache is no longer needed. Variables affected by assumptions are now found by using the new operand-bundle-based scheme. This new scheme is more computationally efficient, and also we need much less code... llvm-svn: 289756
* [PM] Change the static object whose address is used to uniquely identifyChandler Carruth2016-11-231-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | analyses to have a common type which is enforced rather than using a char object and a `void *` type when used as an identifier. This has a number of advantages. First, it at least helps some of the confusion raised in Justin Lebar's code review of why `void *` was being used everywhere by having a stronger type that connects to documentation about this. However, perhaps more importantly, it addresses a serious issue where the alignment of these pointer-like identifiers was unknown. This made it hard to use them in pointer-like data structures. We were already dodging this in dangerous ways to create the "all analyses" entry. In a subsequent patch I attempted to use these with TinyPtrVector and things fell apart in a very bad way. And it isn't just a compile time or type system issue. Worse than that, the actual alignment of these pointer-like opaque identifiers wasn't guaranteed to be a useful alignment as they were just characters. This change introduces a type to use as the "key" object whose address forms the opaque identifier. This both forces the objects to have proper alignment, and provides type checking that we get it right everywhere. It also makes the types somewhat less mysterious than `void *`. We could go one step further and introduce a truly opaque pointer-like type to return from the `ID()` static function rather than returning `AnalysisKey *`, but that didn't seem to be a clear win so this is just the initial change to get to a reliably typed and aligned object serving is a key for all the analyses. Thanks to Richard Smith and Justin Lebar for helping pick plausible names and avoid making this refactoring many times. =] And thanks to Sean for the super fast review! While here, I've tried to move away from the "PassID" nomenclature entirely as it wasn't really helping and is overloaded with old pass manager constructs. Now we have IDs for analyses, and key objects whose address can be used as IDs. Where possible and clear I've shortened this to just "ID". In a few places I kept "AnalysisID" to make it clear what was being identified. Differential Revision: https://reviews.llvm.org/D27031 llvm-svn: 287783
* NFC small changes in MemDepPiotr Padlewski2016-11-081-3/+3
| | | | llvm-svn: 286260
* Test commit access (NFC)Henric Karlsson2016-10-061-1/+1
| | | | llvm-svn: 283439
* Do not widen load for different variable in GVN.Dehao Chen2016-09-091-37/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Summary: Widening load in GVN is too early because it will block other optimizations like PRE, LICM. https://llvm.org/bugs/show_bug.cgi?id=29110 The SPECCPU2006 benchmark impact of this patch: Reference: o2_nopatch (1): o2_patched Benchmark Base:Reference (1) ------------------------------------------------------- spec/2006/fp/C++/444.namd 25.2 -0.08% spec/2006/fp/C++/447.dealII 45.92 +1.05% spec/2006/fp/C++/450.soplex 41.7 -0.26% spec/2006/fp/C++/453.povray 35.65 +1.68% spec/2006/fp/C/433.milc 23.79 +0.42% spec/2006/fp/C/470.lbm 41.88 -1.12% spec/2006/fp/C/482.sphinx3 47.94 +1.67% spec/2006/int/C++/471.omnetpp 22.46 -0.36% spec/2006/int/C++/473.astar 21.19 +0.24% spec/2006/int/C++/483.xalancbmk 36.09 -0.11% spec/2006/int/C/400.perlbench 33.28 +1.35% spec/2006/int/C/401.bzip2 22.76 -0.04% spec/2006/int/C/403.gcc 32.36 +0.12% spec/2006/int/C/429.mcf 41.04 -0.41% spec/2006/int/C/445.gobmk 26.94 +0.04% spec/2006/int/C/456.hmmer 24.5 -0.20% spec/2006/int/C/458.sjeng 28 -0.46% spec/2006/int/C/462.libquantum 55.25 +0.27% spec/2006/int/C/464.h264ref 45.87 +0.72% geometric mean +0.23% For most benchmarks, it's a wash, but we do see stable improvements on some benchmarks, e.g. 447,453,482,400. Reviewers: davidxl, hfinkel, dberlin, sanjoy, reames Subscribers: gberry, junbuml Differential Revision: https://reviews.llvm.org/D24096 llvm-svn: 281074
* limit the number of instructions per block examined by dead store eliminationBob Haarman2016-08-261-6/+17
| | | | | | | | | | | | Summary: Dead store elimination gets very expensive when large numbers of instructions need to be analyzed. This patch limits the number of instructions analyzed per store to the value of the memdep-block-scan-limit parameter (which defaults to 100). This resulted in no observed difference in performance of the generated code, and no change in the statistics for the dead store elimination pass, but improved compilation time on some files by more than an order of magnitude. Reviewers: dexonsmith, bruno, george.burgess.iv, dberlin, reames, davidxl Subscribers: davide, chandlerc, dberlin, davidxl, eraman, tejohnson, mbodart, llvm-commits Differential Revision: https://reviews.llvm.org/D15537 llvm-svn: 279833
OpenPOWER on IntegriCloud