llvm/lib/Fuzzer/FuzzerTraceState.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379

//===- FuzzerTraceState.cpp - Trace-based fuzzer mutator ------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This file implements a mutation algorithm based on instruction traces and
// on taint analysis feedback from DFSan.
//
// Instruction traces are special hooks inserted by the compiler around
// interesting instructions. Currently supported traces:
//   * __sanitizer_cov_trace_cmp -- inserted before every ICMP instruction,
//    receives the type, size and arguments of ICMP.
//
// Every time a traced event is intercepted we analyse the data involved
// in the event and suggest a mutation for future executions.
// For example if 4 bytes of data that derive from input bytes {4,5,6,7}
// are compared with a constant 12345,
// we try to insert 12345, 12344, 12346 into bytes
// {4,5,6,7} of the next fuzzed inputs.
//
// The fuzzer can work only with the traces, or with both traces and DFSan.
//
// DataFlowSanitizer (DFSan) is a tool for
// generalised dynamic data flow (taint) analysis:
// http://clang.llvm.org/docs/DataFlowSanitizer.html .
//
// The approach with DFSan-based fuzzing has some similarity to
// "Taint-based Directed Whitebox Fuzzing"
// by Vijay Ganesh & Tim Leek & Martin Rinard:
// http://dspace.mit.edu/openaccess-disseminate/1721.1/59320,
// but it uses a full blown LLVM IR taint analysis and separate instrumentation
// to analyze all of the "attack points" at once.
//
// Workflow with DFSan:
//   * lib/Fuzzer/Fuzzer*.cpp is compiled w/o any instrumentation.
//   * The code under test is compiled with DFSan *and* with instruction traces.
//   * Every call to HOOK(a,b) is replaced by DFSan with
//     __dfsw_HOOK(a, b, label(a), label(b)) so that __dfsw_HOOK
//     gets all the taint labels for the arguments.
//   * At the Fuzzer startup we assign a unique DFSan label
//     to every byte of the input string (Fuzzer::CurrentUnit) so that for any
//     chunk of data we know which input bytes it has derived from.
//   * The __dfsw_* functions (implemented in this file) record the
//     parameters (i.e. the application data and the corresponding taint labels)
//     in a global state.
//   * Fuzzer::ApplyTraceBasedMutation() tries to use the data recorded
//     by __dfsw_* hooks to guide the fuzzing towards new application states.
//
// Parts of this code will not function when DFSan is not linked in.
// Instead of using ifdefs and thus requiring a separate build of lib/Fuzzer
// we redeclare the dfsan_* interface functions as weak and check if they
// are nullptr before calling.
// If this approach proves to be useful we may add attribute(weak) to the
// dfsan declarations in dfsan_interface.h
//
// This module is in the "proof of concept" stage.
// It is capable of solving only the simplest puzzles
// like test/dfsan/DFSanSimpleCmpTest.cpp.
//===----------------------------------------------------------------------===//

/* Example of manual usage (-fsanitize=dataflow is optional):
(
  cd $LLVM/lib/Fuzzer/
  clang  -fPIC -c -g -O2 -std=c++11 Fuzzer*.cpp
  clang++ -O0 -std=c++11 -fsanitize-coverage=edge,trace-cmp \
    -fsanitize=dataflow \
    test/dfsan/DFSanSimpleCmpTest.cpp Fuzzer*.o
  ./a.out
)
*/

#include "FuzzerInternal.h"
#include <sanitizer/dfsan_interface.h>

#include <algorithm>
#include <cstring>
#include <iostream>
#include <unordered_map>

extern "C" {
__attribute__((weak))
dfsan_label dfsan_create_label(const char *desc, void *userdata);
__attribute__((weak))
void dfsan_set_label(dfsan_label label, void *addr, size_t size);
__attribute__((weak))
void dfsan_add_label(dfsan_label label, void *addr, size_t size);
__attribute__((weak))
const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label);
__attribute__((weak))
dfsan_label dfsan_read_label(const void *addr, size_t size);
}  // extern "C"

namespace fuzzer {

static bool ReallyHaveDFSan() {
  return &dfsan_create_label != nullptr;
}

// These values are copied from include/llvm/IR/InstrTypes.h.
// We do not include the LLVM headers here to remain independent.
// If these values ever change, an assertion in ComputeCmp will fail.
enum Predicate {
  ICMP_EQ = 32,  ///< equal
  ICMP_NE = 33,  ///< not equal
  ICMP_UGT = 34, ///< unsigned greater than
  ICMP_UGE = 35, ///< unsigned greater or equal
  ICMP_ULT = 36, ///< unsigned less than
  ICMP_ULE = 37, ///< unsigned less or equal
  ICMP_SGT = 38, ///< signed greater than
  ICMP_SGE = 39, ///< signed greater or equal
  ICMP_SLT = 40, ///< signed less than
  ICMP_SLE = 41, ///< signed less or equal
};

template <class U, class S>
bool ComputeCmp(size_t CmpType, U Arg1, U Arg2) {
  switch(CmpType) {
    case ICMP_EQ : return Arg1 == Arg2;
    case ICMP_NE : return Arg1 != Arg2;
    case ICMP_UGT: return Arg1 > Arg2;
    case ICMP_UGE: return Arg1 >= Arg2;
    case ICMP_ULT: return Arg1 < Arg2;
    case ICMP_ULE: return Arg1 <= Arg2;
    case ICMP_SGT: return (S)Arg1 > (S)Arg2;
    case ICMP_SGE: return (S)Arg1 >= (S)Arg2;
    case ICMP_SLT: return (S)Arg1 < (S)Arg2;
    case ICMP_SLE: return (S)Arg1 <= (S)Arg2;
    default: assert(0 && "unsupported CmpType");
  }
  return false;
}

static bool ComputeCmp(size_t CmpSize, size_t CmpType, uint64_t Arg1,
                       uint64_t Arg2) {
  if (CmpSize == 8) return ComputeCmp<uint64_t, int64_t>(CmpType, Arg1, Arg2);
  if (CmpSize == 4) return ComputeCmp<uint32_t, int32_t>(CmpType, Arg1, Arg2);
  if (CmpSize == 2) return ComputeCmp<uint16_t, int16_t>(CmpType, Arg1, Arg2);
  if (CmpSize == 1) return ComputeCmp<uint8_t, int8_t>(CmpType, Arg1, Arg2);
  assert(0 && "unsupported type size");
  return true;
}

// As a simplification we use the range of input bytes instead of a set of input
// bytes.
struct LabelRange {
  uint16_t Beg, End;  // Range is [Beg, End), thus Beg==End is an empty range.

  LabelRange(uint16_t Beg = 0, uint16_t End = 0) : Beg(Beg), End(End) {}

  static LabelRange Join(LabelRange LR1, LabelRange LR2) {
    if (LR1.Beg == LR1.End) return LR2;
    if (LR2.Beg == LR2.End) return LR1;
    return {std::min(LR1.Beg, LR2.Beg), std::max(LR1.End, LR2.End)};
  }
  LabelRange &Join(LabelRange LR) {
    return *this = Join(*this, LR);
  }
  static LabelRange Singleton(const dfsan_label_info *LI) {
    uint16_t Idx = (uint16_t)(uintptr_t)LI->userdata;
    assert(Idx > 0);
    return {(uint16_t)(Idx - 1), Idx};
  }
};

std::ostream &operator<<(std::ostream &os, const LabelRange &LR) {
  return os << "[" << LR.Beg << "," << LR.End << ")";
}

// For now, very simple: put Size bytes of Data at position Pos.
struct TraceBasedMutation {
  size_t Pos;
  size_t Size;
  uint64_t Data;
};

class TraceState {
 public:
   TraceState(const Fuzzer::FuzzingOptions &Options, const Unit &CurrentUnit)
       : Options(Options), CurrentUnit(CurrentUnit) {}

  LabelRange GetLabelRange(dfsan_label L);
  void DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
                        uint64_t Arg1, uint64_t Arg2, dfsan_label L1,
                        dfsan_label L2);
  void TraceCmpCallback(size_t CmpSize, size_t CmpType, uint64_t Arg1,
                        uint64_t Arg2);
  int TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData,
                           size_t DataSize);

  void StartTraceRecording() {
    if (!Options.UseTraces) return;
    RecordingTraces = true;
    Mutations.clear();
  }

  size_t StopTraceRecording() {
    RecordingTraces = false;
    std::random_shuffle(Mutations.begin(), Mutations.end());
    return Mutations.size();
  }

  void ApplyTraceBasedMutation(size_t Idx, fuzzer::Unit *U);

 private:
  bool IsTwoByteData(uint64_t Data) {
    int64_t Signed = static_cast<int64_t>(Data);
    Signed >>= 16;
    return Signed == 0 || Signed == -1L;
  }
  bool RecordingTraces = false;
  std::vector<TraceBasedMutation> Mutations;
  LabelRange LabelRanges[1 << (sizeof(dfsan_label) * 8)] = {};
  const Fuzzer::FuzzingOptions &Options;
  const Unit &CurrentUnit;
};

LabelRange TraceState::GetLabelRange(dfsan_label L) {
  LabelRange &LR = LabelRanges[L];
  if (LR.Beg < LR.End || L == 0)
    return LR;
  const dfsan_label_info *LI = dfsan_get_label_info(L);
  if (LI->l1 || LI->l2)
    return LR = LabelRange::Join(GetLabelRange(LI->l1), GetLabelRange(LI->l2));
  return LR = LabelRange::Singleton(LI);
}

void TraceState::ApplyTraceBasedMutation(size_t Idx, fuzzer::Unit *U) {
  assert(Idx < Mutations.size());
  auto &M = Mutations[Idx];
  if (Options.Verbosity >= 3)
    std::cerr << "TBM " << M.Pos << " " << M.Size << " " << M.Data << "\n";
  if (M.Pos + M.Size > U->size()) return;
  memcpy(U->data() + M.Pos, &M.Data, M.Size);
}

void TraceState::DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
                                  uint64_t Arg1, uint64_t Arg2, dfsan_label L1,
                                  dfsan_label L2) {
  assert(ReallyHaveDFSan());
  if (!RecordingTraces) return;
  if (L1 == 0 && L2 == 0)
    return;  // Not actionable.
  if (L1 != 0 && L2 != 0)
    return;  // Probably still actionable.
  bool Res = ComputeCmp(CmpSize, CmpType, Arg1, Arg2);
  uint64_t Data = L1 ? Arg2 : Arg1;
  LabelRange LR = L1 ? GetLabelRange(L1) : GetLabelRange(L2);

  for (size_t Pos = LR.Beg; Pos + CmpSize <= LR.End; Pos++) {
    Mutations.push_back({Pos, CmpSize, Data});
    Mutations.push_back({Pos, CmpSize, Data + 1});
    Mutations.push_back({Pos, CmpSize, Data - 1});
  }

  if (CmpSize > LR.End - LR.Beg)
    Mutations.push_back({LR.Beg, (unsigned)(LR.End - LR.Beg), Data});


  if (Options.Verbosity >= 3)
    std::cerr << "DFSAN:"
              << " PC " << std::hex << PC << std::dec
              << " S " << CmpSize
              << " T " << CmpType
              << " A1 " << Arg1 << " A2 " << Arg2 << " R " << Res
              << " L" << L1
              << " L" << L2
              << " R"  << LR
              << " MU " << Mutations.size()
              << "\n";
}

int TraceState::TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData,
                                    size_t DataSize) {
  int Res = 0;
  const uint8_t *Beg = CurrentUnit.data();
  const uint8_t *End = Beg + CurrentUnit.size();
  for (const uint8_t *Cur = Beg; Cur < End; Cur += DataSize) {
    Cur = (uint8_t *)memmem(Cur, End - Cur, &PresentData, DataSize);
    if (!Cur)
      break;
    // std::cerr << "Cur " << (void*)Cur << "\n";
    size_t Pos = Cur - Beg;
    assert(Pos < CurrentUnit.size());
    Mutations.push_back({Pos, DataSize, DesiredData});
    Mutations.push_back({Pos, DataSize, DesiredData + 1});
    Mutations.push_back({Pos, DataSize, DesiredData - 1});
    Cur += DataSize;
    Res++;
  }
  return Res;
}

void TraceState::TraceCmpCallback(size_t CmpSize, size_t CmpType, uint64_t Arg1,
                        uint64_t Arg2) {
  if (!RecordingTraces) return;
  int Added = 0;
  if (Options.Verbosity >= 3)
    std::cerr << "TraceCmp: " << Arg1 << " " << Arg2 << "\n";
  Added += TryToAddDesiredData(Arg1, Arg2, CmpSize);
  Added += TryToAddDesiredData(Arg2, Arg1, CmpSize);
  if (!Added && CmpSize == 4 && IsTwoByteData(Arg1) && IsTwoByteData(Arg2)) {
    Added += TryToAddDesiredData(Arg1, Arg2, 2);
    Added += TryToAddDesiredData(Arg2, Arg1, 2);
  }
}

static TraceState *TS;

void Fuzzer::StartTraceRecording() {
  if (!TS) return;
  TS->StartTraceRecording();
}

size_t Fuzzer::StopTraceRecording() {
  if (!TS) return 0;
  return TS->StopTraceRecording();
}

void Fuzzer::ApplyTraceBasedMutation(size_t Idx, Unit *U) {
  assert(TS);
  TS->ApplyTraceBasedMutation(Idx, U);
}

void Fuzzer::InitializeTraceState() {
  if (!Options.UseTraces && !Options.UseDFSan) return;
  TS = new TraceState(Options, CurrentUnit);
  CurrentUnit.resize(Options.MaxLen);
  // The rest really requires DFSan.
  if (!ReallyHaveDFSan() || !Options.UseDFSan) return;
  for (size_t i = 0; i < static_cast<size_t>(Options.MaxLen); i++) {
    dfsan_label L = dfsan_create_label("input", (void*)(i + 1));
    // We assume that no one else has called dfsan_create_label before.
    assert(L == i + 1);
    dfsan_set_label(L, &CurrentUnit[i], 1);
  }
}

}  // namespace fuzzer

using fuzzer::TS;

extern "C" {
void __dfsw___sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1,
                                      uint64_t Arg2, dfsan_label L0,
                                      dfsan_label L1, dfsan_label L2) {
  assert(TS);
  assert(L0 == 0);
  uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
  uint64_t CmpSize = (SizeAndType >> 32) / 8;
  uint64_t Type = (SizeAndType << 32) >> 32;
  TS->DFSanCmpCallback(PC, CmpSize, Type, Arg1, Arg2, L1, L2);
}

void dfsan_weak_hook_memcmp(void *caller_pc, const void *s1, const void *s2,
                            size_t n, dfsan_label s1_label,
                            dfsan_label s2_label, dfsan_label n_label) {
  assert(TS);
  uintptr_t PC = reinterpret_cast<uintptr_t>(caller_pc);
  uint64_t S1 = 0, S2 = 0;
  // Simplification: handle only first 8 bytes.
  memcpy(&S1, s1, std::min(n, sizeof(S1)));
  memcpy(&S2, s2, std::min(n, sizeof(S2)));
  dfsan_label L1 = dfsan_read_label(s1, n);
  dfsan_label L2 = dfsan_read_label(s2, n);
  TS->DFSanCmpCallback(PC, n, fuzzer::ICMP_EQ, S1, S2, L1, L2);
}

void __sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1,
                               uint64_t Arg2) {
  if (!TS) return;
  uint64_t CmpSize = (SizeAndType >> 32) / 8;
  uint64_t Type = (SizeAndType << 32) >> 32;
  TS->TraceCmpCallback(CmpSize, Type, Arg1, Arg2);
}

}  // extern "C"