compiler-rt/lib/fuzzer/dataflow/DataFlow.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277

/*===- DataFlow.cpp - a standalone DataFlow tracer                  -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// An experimental data-flow tracer for fuzz targets.
// It is based on DFSan and SanitizerCoverage.
// https://clang.llvm.org/docs/DataFlowSanitizer.html
// https://clang.llvm.org/docs/SanitizerCoverage.html#tracing-data-flow
//
// It executes the fuzz target on the given input while monitoring the
// data flow for every instrumented comparison instruction.
//
// The output shows which functions depend on which bytes of the input,
// and also provides basic-block coverage for every input.
//
// Build:
//   1. Compile this file with -fsanitize=dataflow
//   2. Build the fuzz target with -g -fsanitize=dataflow
//       -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp
//   3. Link those together with -fsanitize=dataflow
//
//  -fsanitize-coverage=trace-cmp inserts callbacks around every comparison
//  instruction, DFSan modifies the calls to pass the data flow labels.
//  The callbacks update the data flow label for the current function.
//  See e.g. __dfsw___sanitizer_cov_trace_cmp1 below.
//
//  -fsanitize-coverage=trace-pc-guard,pc-table,bb instruments function
//  entries so that the comparison callback knows the current function.
//  -fsanitize-coverage=...,bb also allows collecting basic block coverage.
//
//
// Run:
//   # Collect data flow and coverage for INPUT_FILE and
//   # write them to OUTPUT_FILE (default: stdout)
//   ./a.out FIRST_LABEL LAST_LABEL INPUT_FILE [OUTPUT_FILE]
//
//   # Print all instrumented functions. llvm-symbolizer must be present in PATH
//   ./a.out
//
// Example output:
// ===============
//  F0 11111111111111
//  F1 10000000000000
//  C0 1 2 3 4
//  C1
//  ===============
// "FN xxxxxxxxxx": tells which bytes of the input the function N depends on.
//    The byte string is LEN+1 bytes long. The last byte is set if the function
//    depends on the input length.
// "CN X Y Z": tells that a function N has basic blocks X, Y, and Z covered
//    in addition to the function's entry block.
//
//===----------------------------------------------------------------------===*/

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#include <execinfo.h>  // backtrace_symbols_fd

#include <sanitizer/dfsan_interface.h>

extern "C" {
// The fuzz target's entry point. main() calls it once with the contents of
// the input file.
extern int LLVMFuzzerTestOneInput(const unsigned char *Data, size_t Size);
// Optional initialization hook of the fuzz target. It is declared weak, and
// main() tests the symbol for null before calling it.
__attribute__((weak)) extern int LLVMFuzzerInitialize(int *argc, char ***argv);
} // extern "C"

// Size of the input file in bytes, as determined by main().
static size_t InputLen;
// Input bytes whose index lies in [InputLabelBeg, InputLabelEnd) get their own
// DFSan label. Both values are parsed from the command line in main().
static size_t InputLabelBeg;
static size_t InputLabelEnd;
// The label that main() attaches to InputLen itself.
static size_t InputSizeLabel;
// Number of guards flagged as function entries in the PC table.
static size_t NumFuncs;
// Total number of coverage guards (GuardsEnd - GuardsBeg).
static size_t NumGuards;
// Guard range handed to __sanitizer_cov_trace_pc_guard_init.
static uint32_t *GuardsBeg;
static uint32_t *GuardsEnd;
// PC table handed to __sanitizer_cov_pcs_init; two words (PC, flags) per guard.
static const uintptr_t *PCsBeg;
static const uintptr_t *PCsEnd;
// Zero-based number of the function whose entry guard fired most recently on
// this thread.
static __thread size_t CurrentFunc;
// Array of NumFuncs elements: the accumulated label of every function.
static dfsan_label *FuncLabels;
// Array of NumGuards elements: true once the corresponding guard has fired.
static bool *BBExecuted;
// Scratch buffer of InputLen + 2 bytes used to render a label as a string.
static char *PrintableStringForLabel;
// One flag per possible dfsan_label value; marks labels already expanded
// while rendering a single label.
static bool LabelSeen[1 << 8 * sizeof(dfsan_label)];

// Flag bits found in the second word of every PC-table entry
// (i.e. PCsBeg[I * 2 + 1]).
enum {
  PCFLAG_FUNC_ENTRY = 1,  // The guard belongs to a function's entry block.
};

// Prints all instrumented functions.
//
// Every PC-table entry flagged as a function entry is written, via
// backtrace_symbols_fd, into a shell pipeline that rewrites the line with sed,
// symbolizes it with llvm-symbolizer (which must be in PATH), keeps only the
// lines containing the literal text "dfs$" and strips that text.
// The pipeline's own output is not captured here.
//
// Returns 0 on success and 1 if the pipeline could not be started.
static int PrintFunctions() {
  // We don't have the symbolizer integrated with dfsan yet.
  // So use backtrace_symbols_fd and pipe it through llvm-symbolizer.
  // TODO(kcc): this is pretty ugly and may break in lots of ways.
  //      We'll need to make a proper in-process symbolizer work with DFSan.
  FILE *Pipe = popen("sed 's/(+/ /g; s/).*//g' "
                     "| llvm-symbolizer "
                     "| grep 'dfs\\$' "
                     "| sed 's/dfs\\$//g'", "w");
  // popen fails when the shell cannot be spawned; without this check a null
  // stream would be handed to fileno() and pclose().
  if (!Pipe) {
    perror("ERROR: popen");
    return 1;
  }
  // backtrace_symbols_fd writes straight to the descriptor, bypassing the
  // stdio buffer of Pipe.
  const int PipeFd = fileno(Pipe);
  for (size_t I = 0; I < NumGuards; I++) {
    uintptr_t PC = PCsBeg[I * 2];
    uintptr_t PCFlags = PCsBeg[I * 2 + 1];
    if (!(PCFlags & PCFLAG_FUNC_ENTRY)) continue;  // Only function entries.
    void *const Buf[1] = {(void*)PC};
    backtrace_symbols_fd(Buf, 1, PipeFd);
  }
  pclose(Pipe);
  return 0;
}

// Marks in Bytes every input position that label L depends on.
//
// A label below InputSizeLabel stands for a single input byte and sets the
// '1' at index L + InputLabelBeg - 1. InputSizeLabel itself sets the '1' at
// index InputLen. Any larger label is a union, whose two components are
// expanded recursively. LabelSeen guarantees that each label is expanded at
// most once until the caller clears it.
extern "C"
void SetBytesForLabel(dfsan_label L, char *Bytes) {
  if (LabelSeen[L])
    return;
  LabelSeen[L] = true;
  assert(L);

  if (L > InputSizeLabel) {
    // A union label: descend into both halves.
    auto *Info = dfsan_get_label_info(L);
    SetBytesForLabel(Info->l1, Bytes);
    SetBytesForLabel(Info->l2, Bytes);
    return;
  }

  // A base label: either the input size or one particular input byte.
  const size_t Pos = (L == InputSizeLabel) ? InputLen : L + InputLabelBeg - 1;
  Bytes[Pos] = '1';
}

// Renders label L as a NUL-terminated string of InputLen + 1 characters,
// each '0' or '1', in the shared PrintableStringForLabel buffer.
// The returned pointer is that shared buffer, so the contents are overwritten
// by the next call.
static char *GetPrintableStringForLabel(dfsan_label L) {
  char *Str = PrintableStringForLabel;
  const size_t NumDigits = InputLen + 1;

  // Start from a clean slate: no label visited, every position '0'.
  memset(LabelSeen, 0, sizeof(LabelSeen));
  memset(Str, '0', NumDigits);
  Str[NumDigits] = 0;

  SetBytesForLabel(L, Str);
  return Str;
}

static void PrintDataFlow(FILE *Out) {
  for (size_t I = 0; I < NumFuncs; I++)
    if (FuncLabels[I])
      fprintf(Out, "F%zd %s\n", I, GetPrintableStringForLabel(FuncLabels[I]));
}

// Writes the basic-block coverage to Out.
//
// For every function whose entry guard fired, a "C<function number> " record
// is started; each further executed guard is appended to the running line as
// its distance from the most recent entry guard. Records are separated by
// newlines and the output always ends with a newline.
static void PrintCoverage(FILE *Out) {
  ssize_t EntryGuardIdx = -1;  // Index of the most recent function-entry guard.
  ssize_t FuncIdx = -1;        // Number of the function that guard belongs to.
  int NumRecords = 0;          // "C..." records started so far.

  for (size_t Guard = 0; Guard < NumGuards; Guard++) {
    const bool AtEntry = PCsBeg[Guard * 2 + 1] & PCFLAG_FUNC_ENTRY;
    // Functions are counted whether or not they were executed.
    if (AtEntry) {
      FuncIdx++;
      EntryGuardIdx = Guard;
    }
    if (BBExecuted[Guard]) {
      if (!AtEntry) {
        fprintf(Out, "%zd ", Guard - EntryGuardIdx);
      } else {
        if (NumRecords) fprintf(Out, "\n");
        fprintf(Out, "C%zd ", FuncIdx);
        NumRecords++;
      }
    }
  }
  fprintf(Out, "\n");
}

// Tool entry point.
//
// Usage:
//   a.out                                        print instrumented functions
//   a.out FIRST_LABEL LAST_LABEL INPUT [OUTPUT]  trace one input
//
// In tracing mode the input file is read into memory, every byte whose index
// is in [FIRST_LABEL, LAST_LABEL) gets its own DFSan label, the input length
// gets one more label, and the fuzz target is run once. The per-function data
// flow and then the coverage are written to OUTPUT, or to stdout if OUTPUT is
// omitted.
//
// Returns 0 on success and 1 if a file cannot be opened or sized, or if
// memory cannot be allocated. Malformed arguments and violated labelling
// invariants are reported via assert.
int main(int argc, char **argv) {
  if (LLVMFuzzerInitialize)
    LLVMFuzzerInitialize(&argc, &argv);
  if (argc == 1)
    return PrintFunctions();
  assert(argc == 4 || argc == 5);
  InputLabelBeg = atoi(argv[1]);
  InputLabelEnd = atoi(argv[2]);
  assert(InputLabelBeg < InputLabelEnd);

  const char *Input = argv[3];
  fprintf(stderr, "INFO: reading '%s'\n", Input);
  FILE *In = fopen(Input, "r");
  // A missing or unreadable input is a user error; report it even when
  // asserts are compiled out.
  if (!In) {
    fprintf(stderr, "ERROR: cannot open '%s' for reading\n", Input);
    return 1;
  }
  fseek(In, 0, SEEK_END);
  // ftell reports failure as -1, which must not be stored into a size_t.
  long FileSize = ftell(In);
  if (FileSize < 0) {
    fprintf(stderr, "ERROR: cannot determine the size of '%s'\n", Input);
    fclose(In);
    return 1;
  }
  InputLen = FileSize;
  fseek(In, 0, SEEK_SET);
  unsigned char *Buf = (unsigned char*)malloc(InputLen);
  // malloc(0) may legitimately return null, so only a non-empty input
  // requires a valid buffer.
  if (InputLen && !Buf) {
    fprintf(stderr, "ERROR: cannot allocate %zu bytes\n", InputLen);
    fclose(In);
    return 1;
  }
  size_t NumBytesRead = fread(Buf, 1, InputLen, In);
  assert(NumBytesRead == InputLen);
  // InputLen characters, one extra for the size label, and the terminator.
  PrintableStringForLabel = (char*)malloc(InputLen + 2);
  fclose(In);
  if (!PrintableStringForLabel) {
    fprintf(stderr, "ERROR: cannot allocate %zu bytes\n", InputLen + 2);
    free(Buf);
    return 1;
  }

  fprintf(stderr, "INFO: running '%s'\n", Input);
  // Labels are expected to be handed out sequentially starting from 1, so the
  // byte at index Idx gets label Idx - InputLabelBeg + 1.
  for (size_t I = 1; I <= InputLen; I++) {
    size_t Idx = I - 1;
    if (Idx >= InputLabelBeg && Idx < InputLabelEnd) {
      dfsan_label L = dfsan_create_label("", nullptr);
      assert(L == I - InputLabelBeg);
      dfsan_set_label(L, Buf + Idx, 1);
    }
  }
  // The label following the per-byte labels tracks the input length.
  dfsan_label SizeL = dfsan_create_label("", nullptr);
  InputSizeLabel = SizeL;
  assert(InputSizeLabel == InputLabelEnd - InputLabelBeg + 1);
  dfsan_set_label(SizeL, &InputLen, sizeof(InputLen));

  LLVMFuzzerTestOneInput(Buf, InputLen);
  free(Buf);

  bool OutIsStdout = argc == 4;
  fprintf(stderr, "INFO: writing dataflow to %s\n",
          OutIsStdout ? "<stdout>" : argv[4]);
  FILE *Out = OutIsStdout ? stdout : fopen(argv[4], "w");
  // Without this check a failed fopen would pass a null stream to fprintf.
  if (!Out) {
    fprintf(stderr, "ERROR: cannot open '%s' for writing\n", argv[4]);
    return 1;
  }
  PrintDataFlow(Out);
  PrintCoverage(Out);
  if (!OutIsStdout) fclose(Out);
  return 0;
}

extern "C" {

// SanitizerCoverage callback: records the guard range [start, stop).
// The range is stored only if it is non-empty and its first guard is still
// zero; otherwise the call is treated as a repeated initialization and
// ignored.
void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop) {
  assert(NumFuncs == 0 && "This tool does not support DSOs");
  assert(start < stop && "The code is not instrumented for coverage");

  // Initialize only once.
  const bool NothingToDo = (start == stop) || (*start != 0);
  if (!NothingToDo) {
    GuardsBeg = start;
    GuardsEnd = stop;
  }
}

// SanitizerCoverage callback: records the PC table [pcs_beg, pcs_end), which
// holds two words (PC, flags) per guard.
// Numbers the functions by storing 1, 2, 3, ... into the guards of function
// entries (all other guards are left untouched), then allocates the
// zero-initialized FuncLabels and BBExecuted arrays.
// Only the first call has any effect.
void __sanitizer_cov_pcs_init(const uintptr_t *pcs_beg,
                              const uintptr_t *pcs_end) {
  // Initialize only once.
  if (NumGuards != 0)
    return;

  PCsBeg = pcs_beg;
  PCsEnd = pcs_end;
  NumGuards = GuardsEnd - GuardsBeg;
  assert(NumGuards == (PCsEnd - PCsBeg) / 2);

  for (size_t Guard = 0; Guard != NumGuards; ++Guard) {
    const uintptr_t Flags = PCsBeg[Guard * 2 + 1];
    if (!(Flags & PCFLAG_FUNC_ENTRY))
      continue;
    // Guard values are one-based function numbers.
    GuardsBeg[Guard] = ++NumFuncs;
  }

  BBExecuted = (bool*)calloc(NumGuards, sizeof(bool));
  FuncLabels = (dfsan_label*)calloc(NumFuncs, sizeof(dfsan_label));
  fprintf(stderr, "INFO: %zd instrumented function(s) observed "
          "and %zd basic blocks\n", NumFuncs, NumGuards);
}

// Deliberately empty: this tool does not use the callback.
void __sanitizer_cov_trace_pc_indir(uint64_t x) {
  (void)x;  // unused.
}

// SanitizerCoverage callback invoked when the code behind `guard` executes.
// Marks the guard's basic block as executed and, if the guard holds a
// non-zero (one-based) function number, makes that function the current one
// for this thread.
void __sanitizer_cov_trace_pc_guard(uint32_t *guard) {
  const size_t Idx = guard - GuardsBeg;
  assert(Idx < NumGuards);
  BBExecuted[Idx] = true;

  const uint32_t GuardValue = *guard;
  if (GuardValue != 0) {  // Zero means: not a function entry.
    const uint32_t FuncNum = GuardValue - 1;  // Guards start from 1.
    assert(FuncNum < NumFuncs);
    CurrentFunc = FuncNum;
  }
}

void __dfsw___sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases,
                                         dfsan_label L1, dfsan_label UnusedL) {
  assert(CurrentFunc < NumFuncs);
  FuncLabels[CurrentFunc] = dfsan_union(FuncLabels[CurrentFunc], L1);
}

#define HOOK(Name, Type)                                                       \
  void Name(Type Arg1, Type Arg2, dfsan_label L1, dfsan_label L2) {            \
    assert(CurrentFunc < NumFuncs);                                            \
    FuncLabels[CurrentFunc] =                                                  \
        dfsan_union(FuncLabels[CurrentFunc], dfsan_union(L1, L2));             \
  }

HOOK(__dfsw___sanitizer_cov_trace_const_cmp1, uint8_t)
HOOK(__dfsw___sanitizer_cov_trace_const_cmp2, uint16_t)
HOOK(__dfsw___sanitizer_cov_trace_const_cmp4, uint32_t)
HOOK(__dfsw___sanitizer_cov_trace_const_cmp8, uint64_t)
HOOK(__dfsw___sanitizer_cov_trace_cmp1, uint8_t)
HOOK(__dfsw___sanitizer_cov_trace_cmp2, uint16_t)
HOOK(__dfsw___sanitizer_cov_trace_cmp4, uint32_t)
HOOK(__dfsw___sanitizer_cov_trace_cmp8, uint64_t)

} // extern "C"