clang/Lex/IdentifierTable.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269

//===--- Preprocess.cpp - C Language Family Preprocessor Implementation ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by Chris Lattner and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the IdentifierTokenInfo and IdentifierTable interfaces.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/IdentifierTable.h"
#include "clang/Lex/MacroInfo.h"
#include <iostream>
using namespace llvm;
using namespace clang;

//===----------------------------------------------------------------------===//
// IdentifierTokenInfo Implementation
//===----------------------------------------------------------------------===//

void IdentifierTokenInfo::Destroy() {
  delete Macro;
}


//===----------------------------------------------------------------------===//
// Memory Allocation Support
//===----------------------------------------------------------------------===//

/// The identifier table has a very simple memory allocation pattern: it just
/// keeps allocating identifiers, then never frees them unless it frees them
/// all.  As such, we use a simple bump-pointer memory allocator to make
/// allocation speedy.  Shark showed that malloc was 27% of the time spent in
/// IdentifierTable::getIdentifier with malloc, and takes a 4.3% time with this.
#define USE_ALLOCATOR 1
#if USE_ALLOCATOR

namespace {
class MemRegion {
  unsigned RegionSize;
  MemRegion *Next;
  char *NextPtr;
public:
  void Init(unsigned size, MemRegion *next) {
    RegionSize = size;
    Next = next;
    NextPtr = (char*)(this+1);
    
    // FIXME: uses GCC extension.
    unsigned Alignment = __alignof__(IdentifierTokenInfo);
    NextPtr = (char*)((intptr_t)(NextPtr+Alignment-1) &
                      ~(intptr_t)(Alignment-1));
  }
  
  const MemRegion *getNext() const { return Next; }
  unsigned getNumBytesAllocated() const {
    return NextPtr-(const char*)this;
  }
  
  /// Allocate - Allocate and return at least the specified number of bytes.
  ///
  void *Allocate(unsigned AllocSize, MemRegion **RegPtr) {
    // FIXME: uses GCC extension.
    unsigned Alignment = __alignof__(IdentifierTokenInfo);
    // Round size up to an even multiple of the alignment.
    AllocSize = (AllocSize+Alignment-1) & ~(Alignment-1);
    
    // If there is space in this region for the identifier, return it.
    if (unsigned(NextPtr+AllocSize-(char*)this) <= RegionSize) {
      void *Result = NextPtr;
      NextPtr += AllocSize;
      return Result;
    }
    
    // Otherwise, we have to allocate a new chunk.  Create one twice as big as
    // this one.
    MemRegion *NewRegion = (MemRegion *)malloc(RegionSize*2);
    NewRegion->Init(RegionSize*2, this);

    // Update the current "first region" pointer  to point to the new region.
    *RegPtr = NewRegion;
    
    // Try allocating from it now.
    return NewRegion->Allocate(AllocSize, RegPtr);
  }
  
  /// Deallocate - Release all memory for this region to the system.
  ///
  void Deallocate() {
    MemRegion *next = Next;
    free(this);
    if (next)
      next->Deallocate();
  }
};
}

#endif

//===----------------------------------------------------------------------===//
// IdentifierTable Implementation
//===----------------------------------------------------------------------===//


/// IdentifierLink - There is one of these allocated by IdentifierTokenInfo.
/// These form the linked list of buckets for the hash table.
struct IdentifierBucket {
  /// Next - This is the next bucket in the linked list.
  IdentifierBucket *Next;
  
  IdentifierTokenInfo TokInfo;
  // NOTE: TokInfo must be the last element in this structure, as the string
  // information for the identifier is allocated right after it.
};

// FIXME: start hashtablesize off at 8K entries, GROW when density gets to 3.
static unsigned HASH_TABLE_SIZE = 8096;

IdentifierTable::IdentifierTable() {
  IdentifierBucket **TableArray = new IdentifierBucket*[HASH_TABLE_SIZE]();
  TheTable = TableArray;
  NumIdentifiers = 0;
#if USE_ALLOCATOR
  TheMemory = malloc(8*4096);
  ((MemRegion*)TheMemory)->Init(8*4096, 0);
#endif
  
  memset(TheTable, 0, HASH_TABLE_SIZE*sizeof(IdentifierBucket*));
}

IdentifierTable::~IdentifierTable() {
  IdentifierBucket **TableArray = (IdentifierBucket**)TheTable;
  for (unsigned i = 0, e = HASH_TABLE_SIZE; i != e; ++i) {
    IdentifierBucket *Id = TableArray[i]; 
    while (Id) {
      // Free memory referenced by the identifier (e.g. macro info).
      Id->TokInfo.Destroy();
      
      IdentifierBucket *Next = Id->Next;
#if !USE_ALLOCATOR
      free(Id);
#endif
      Id = Next;
    }
  }
#if USE_ALLOCATOR
  ((MemRegion*)TheMemory)->Deallocate();
#endif
  delete [] TableArray;
}

/// HashString - Compute a hash code for the specified string.
///
static unsigned HashString(const char *Start, const char *End) {
  unsigned int Result = 0;
  // Perl hash function.
  while (Start != End)
    Result = Result * 33 + *Start++;
  Result = Result + (Result >> 5);
  return Result;
}

IdentifierTokenInfo &IdentifierTable::get(const char *NameStart,
                                          const char *NameEnd) {
  IdentifierBucket **TableArray = (IdentifierBucket**)TheTable;

  unsigned Hash = HashString(NameStart, NameEnd) % HASH_TABLE_SIZE;
  unsigned Length = NameEnd-NameStart;
  
  IdentifierBucket *IdentHead = TableArray[Hash];
  for (IdentifierBucket *Identifier = IdentHead; Identifier; 
       Identifier = Identifier->Next) {
    if (Identifier->TokInfo.getNameLength() == Length &&
        memcmp(Identifier->TokInfo.getName(), NameStart, Length) == 0)
      return Identifier->TokInfo;
  }

  // Allocate a new identifier, with space for the null-terminated string at the
  // end.
  unsigned AllocSize = sizeof(IdentifierBucket)+Length+1;
#if USE_ALLOCATOR
  IdentifierBucket *Identifier = (IdentifierBucket*)
    ((MemRegion*)TheMemory)->Allocate(AllocSize, (MemRegion**)&TheMemory);
#else
  IdentifierBucket *Identifier = (IdentifierBucket*)malloc(AllocSize);
#endif
  Identifier->TokInfo.NameLen = Length;
  Identifier->TokInfo.Macro = 0;
  Identifier->TokInfo.TokenID = tok::identifier;
  Identifier->TokInfo.IsExtension = false;
  Identifier->TokInfo.IsPoisoned = false;
  Identifier->TokInfo.FETokenInfo = 0;

  // Copy the string information.
  char *StrBuffer = (char*)(Identifier+1);
  memcpy(StrBuffer, NameStart, Length);
  StrBuffer[Length] = 0;  // Null terminate string.
  
  // Link it into the hash table.
  Identifier->Next = IdentHead;
  TableArray[Hash] = Identifier;
  return Identifier->TokInfo;
}

IdentifierTokenInfo &IdentifierTable::get(const std::string &Name) {
  // Don't use c_str() here: no need to be null terminated.
  const char *NameBytes = &Name[0];
  unsigned Size = Name.size();
  return get(NameBytes, NameBytes+Size);
}


/// PrintStats - Print statistics about how well the identifier table is doing
/// at hashing identifiers.
void IdentifierTable::PrintStats() const {
  unsigned NumIdentifiers = 0;
  unsigned NumEmptyBuckets = 0;
  unsigned MaxBucketLength = 0;
  unsigned AverageIdentifierSize = 0;
  unsigned MaxIdentifierLength = 0;
  
  IdentifierBucket **TableArray = (IdentifierBucket**)TheTable;
  for (unsigned i = 0, e = HASH_TABLE_SIZE; i != e; ++i) {
    
    unsigned NumIdentifiersInBucket = 0;
    for (IdentifierBucket *Id = TableArray[i]; Id; Id = Id->Next) {
      AverageIdentifierSize += Id->TokInfo.getNameLength();
      if (MaxIdentifierLength < Id->TokInfo.getNameLength())
        MaxIdentifierLength = Id->TokInfo.getNameLength();
      ++NumIdentifiersInBucket;
    }
    if (NumIdentifiersInBucket > MaxBucketLength) 
      MaxBucketLength = NumIdentifiersInBucket;
    if (NumIdentifiersInBucket == 0)
      ++NumEmptyBuckets;

    NumIdentifiers += NumIdentifiersInBucket;
  }
  
  std::cerr << "\n*** Identifier Table Stats:\n";
  std::cerr << "# Identifiers:   " << NumIdentifiers << "\n";
  std::cerr << "# Empty Buckets: " << NumEmptyBuckets << "\n";
  std::cerr << "Max identifiers in one bucket: " << MaxBucketLength << "\n";
  std::cerr << "Hash density (#identifiers per bucket): "
            << NumIdentifiers/(double)HASH_TABLE_SIZE << "\n";
  std::cerr << "Nonempty hash density (average chain length): "
            << NumIdentifiers/(double)(HASH_TABLE_SIZE-NumEmptyBuckets) << "\n";
  std::cerr << "Ave identifier length: "
            << (AverageIdentifierSize/(double)NumIdentifiers) << "\n";
  std::cerr << "Max identifier length: " << MaxIdentifierLength << "\n";
  
  // Compute statistics about the memory allocated for identifiers.
#if USE_ALLOCATOR
  unsigned BytesUsed = 0;
  unsigned NumRegions = 0;
  const MemRegion *R = (MemRegion*)TheMemory;
  for (; R; R = R->getNext(), ++NumRegions) {
    BytesUsed += R->getNumBytesAllocated();
  }
  std::cerr << "\nNumber of memory regions: " << NumRegions << "\n";
  std::cerr << "Bytes allocated for identifiers: " << BytesUsed << "\n";
#endif
}