[libFuzzer] Fix DataFlow.cpp logic when tracing long inputs.

Summary: 1. Do not create DFSan labels for the bytes which we do not trace. This is where we run out of labels at the first place. 2. When dumping the traces on the disk, make sure to offset the label identifiers by the number of the first byte in the trace range. 3. For the last label, make sure to write it at the last position of the trace bit string, as that label represents the input size, not any particular byte. Also fixed the bug with division in python which I've introduced when migrated the scripts to Python3 (`//` is required for integral division). Otherwise, the scripts are wasting too much time unsuccessfully trying to collect and process traces from the long inputs. For more context, see https://github.com/google/oss-fuzz/issues/1632#issuecomment-481761789 Reviewers: kcc Reviewed By: kcc Subscribers: delcypher, #sanitizers, llvm-commits Tags: #llvm, #sanitizers Differential Revision: https://reviews.llvm.org/D60538 llvm-svn: 358311
author: Max Moroz <mmoroz@chromium.org> 2019-04-12 21:00:12 +0000
committer: Max Moroz <mmoroz@chromium.org> 2019-04-12 21:00:12 +0000
commit: b6e6d3c740a4b94a64ad62745a18571f1a9cb3cb (patch)
tree: b157a324eb0d9219f0c09b16245c0297c48321c0
parent: 9e27514996e7dd183cd84b59656ac64b7514776f (diff)
download: bcm5719-llvm-b6e6d3c740a4b94a64ad62745a18571f1a9cb3cb.tar.gz
bcm5719-llvm-b6e6d3c740a4b94a64ad62745a18571f1a9cb3cb.zip
3 files changed, 29 insertions, 11 deletions
diff --git a/compiler-rt/lib/fuzzer/dataflow/DataFlow.cpp b/compiler-rt/lib/fuzzer/dataflow/DataFlow.cpp
index 742e956e784..187a8e52cd5 100644
--- a/compiler-rt/lib/fuzzer/dataflow/DataFlow.cpp
+++ b/compiler-rt/lib/fuzzer/dataflow/DataFlow.cpp
@@ -63,6 +63,9 @@ __attribute__((weak)) extern int LLVMFuzzerInitialize(int *argc, char ***argv);
 } // extern "C"
 
 static size_t InputLen;
+static size_t InputLabelBeg;
+static size_t InputLabelEnd;
+static size_t InputSizeLabel;
 static size_t NumFuncs;
 static const uintptr_t *FuncsBeg;
 static __thread size_t CurrentFunc;
@@ -95,8 +98,10 @@ void SetBytesForLabel(dfsan_label L, char *Bytes) {
     return;
   LabelSeen[L] = true;
   assert(L);
-  if (L <= InputLen + 1) {
-    Bytes[L - 1] = '1';
+  if (L < InputSizeLabel) {
+    Bytes[L + InputLabelBeg - 1] = '1';
+  } else if (L == InputSizeLabel) {
+    Bytes[InputLen] = '1';
   } else {
     auto *DLI = dfsan_get_label_info(L);
     SetBytesForLabel(DLI->l1, Bytes);
@@ -124,9 +129,9 @@ int main(int argc, char **argv) {
   if (argc == 1)
     return PrintFunctions();
   assert(argc == 4 || argc == 5);
-  size_t Beg = atoi(argv[1]);
-  size_t End = atoi(argv[2]);
-  assert(Beg < End);
+  InputLabelBeg = atoi(argv[1]);
+  InputLabelEnd = atoi(argv[2]);
+  assert(InputLabelBeg < InputLabelEnd);
 
   const char *Input = argv[3];
   fprintf(stderr, "INFO: reading '%s'\n", Input);
@@ -143,14 +148,16 @@ int main(int argc, char **argv) {
 
   fprintf(stderr, "INFO: running '%s'\n", Input);
   for (size_t I = 1; I <= InputLen; I++) {
-    dfsan_label L = dfsan_create_label("", nullptr);
-    assert(L == I);
     size_t Idx = I - 1;
-    if (Idx >= Beg && Idx < End)
+    if (Idx >= InputLabelBeg && Idx < InputLabelEnd) {
+      dfsan_label L = dfsan_create_label("", nullptr);
+      assert(L == I - InputLabelBeg);
       dfsan_set_label(L, Buf + Idx, 1);
+    }
   }
   dfsan_label SizeL = dfsan_create_label("", nullptr);
-  assert(SizeL == InputLen + 1);
+  InputSizeLabel = SizeL;
+  assert(InputSizeLabel == InputLabelEnd - InputLabelBeg + 1);
   dfsan_set_label(SizeL, &InputLen, sizeof(InputLen));
 
   LLVMFuzzerTestOneInput(Buf, InputLen);
diff --git a/compiler-rt/lib/fuzzer/scripts/collect_data_flow.py b/compiler-rt/lib/fuzzer/scripts/collect_data_flow.py
index e8b56a71910..bd601eb63ab 100755
--- a/compiler-rt/lib/fuzzer/scripts/collect_data_flow.py
+++ b/compiler-rt/lib/fuzzer/scripts/collect_data_flow.py
@@ -65,8 +65,8 @@ def main(argv):
     tmpfile = os.path.join(tmpdir, str(r[0]) + "-" + str(r[1]))
     ret = subprocess.call([exe, str(r[0]), str(r[1]), inp, tmpfile])
     if ret and r[1] - r[0] >= 2:
-      q.append([r[0], (r[1] + r[0]) / 2])
-      q.append([(r[1] + r[0]) / 2, r[1]])
+      q.append([r[0], (r[1] + r[0]) // 2])
+      q.append([(r[1] + r[0]) // 2, r[1]])
     else:
       outputs.append(tmpfile)
       print("******* Success: ", r)
diff --git a/compiler-rt/test/fuzzer/dataflow.test b/compiler-rt/test/fuzzer/dataflow.test
index 64f083735cb..36ec55881e8 100644
--- a/compiler-rt/test/fuzzer/dataflow.test
+++ b/compiler-rt/test/fuzzer/dataflow.test
@@ -82,3 +82,14 @@ USE_DATA_FLOW_TRACE: INFO: DataFlowTrace: reading from {{.*}}/OUT
 USE_DATA_FLOW_TRACE-DAG: a8eefe2fd5d6b32028f355fafa3e739a6bf5edc => |000001|
 USE_DATA_FLOW_TRACE-DGA: d28cb407e8e1a702c72d25473f0553d3ec172262 => |0000011|
 USE_DATA_FLOW_TRACE: INFO: DataFlowTrace: 6 trace files, 3 functions, 2 traces with focus function
+
+# Test that we can run collect_data_flow on a long input (>2**16 bytes)
+RUN: rm -rf %t/OUT
+RUN: printf "%0.sA" {1..150001} > %t/IN/very_long_input
+RUN: %libfuzzer_src/scripts/collect_data_flow.py %t-ThreeFunctionsTestDF %t/IN/very_long_input %t/OUT | FileCheck %s --check-prefix=COLLECT_TRACE_FOR_LONG_INPUT
+RUN: rm %t/IN/very_long_input
+COLLECT_TRACE_FOR_LONG_INPUT: ******* Trying:{{[ ]+}}[0, 150001]
+COLLECT_TRACE_FOR_LONG_INPUT: ******* Trying:{{[ ]+}}[75000, 150001]
+COLLECT_TRACE_FOR_LONG_INPUT: ******* Trying:{{[ ]+}}[112500, 150001]
+COLLECT_TRACE_FOR_LONG_INPUT: ******* Success:{{[ ]+}}[{{[0123456789]+}}, 150001]
+COLLECT_TRACE_FOR_LONG_INPUT: ******* Success:{{[ ]+}}[0, {{[0123456789]+}}]
author	Max Moroz <mmoroz@chromium.org>	2019-04-12 21:00:12 +0000
committer	Max Moroz <mmoroz@chromium.org>	2019-04-12 21:00:12 +0000
commit	b6e6d3c740a4b94a64ad62745a18571f1a9cb3cb (patch)
tree	b157a324eb0d9219f0c09b16245c0297c48321c0
parent	9e27514996e7dd183cd84b59656ac64b7514776f (diff)
download	bcm5719-llvm-b6e6d3c740a4b94a64ad62745a18571f1a9cb3cb.tar.gz bcm5719-llvm-b6e6d3c740a4b94a64ad62745a18571f1a9cb3cb.zip