/* frame_malloc.c                  -*-C-*-
 *
 *************************************************************************
 *
 *  Copyright (C) 2009-2016, Intel Corporation
 *  All rights reserved.
 *  
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *  
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *  
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 *  
 *  *********************************************************************
 *  
 *  PLEASE NOTE: This file is a downstream copy of a file maintained in
 *  a repository at cilkplus.org. Changes made to this file that are not
 *  submitted through the contribution process detailed at
 *  http://www.cilkplus.org/submit-cilk-contribution will be lost the next
 *  time that a new version is released. Changes only submitted to the
 *  GNU compiler collection or posted to the git repository at
 *  https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
 *  not tracked.
 *  
 *  We welcome your contributions to this open source project. Thank you
 *  for your assistance in helping us improve Cilk Plus.
 **************************************************************************/

#include "frame_malloc.h"
#include "bug.h"
#include "local_state.h"
#include "cilk_malloc.h"

#ifndef __VXWORKS__
#include <memory.h>
#endif

/* #define USE_MMAP 1 */ 
#if USE_MMAP
#define __USE_MISC 1
#include <sys/mman.h>
#include <errno.h>
#include <assert.h> /* the USE_MMAP path in allocate_batch() uses assert() */
#endif

// Define to fill the stack frame header with the fill character when pushing
// it on a free list.  Note that this should be #ifdef'd out when checked in!

#ifdef _DEBUG
#define HEADER_FILL_CHAR 0xbf
#endif

// HEADER_FILL_CHAR should not be defined when checked in, so put out a warning
// message if this is a release build

#if defined(NDEBUG) && defined (HEADER_FILL_CHAR)
#pragma message ("Warning: HEADER_FILL_CHAR defined for a release build")
#endif

static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size);

#ifndef _WIN32

const unsigned short __cilkrts_bucket_sizes[FRAME_MALLOC_NBUCKETS] =
{
    64, 128, 256, 512, 1024, 2048
};

#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) __cilkrts_bucket_sizes[bucket]

/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE 2048

#else // _WIN32

/* Note that this must match the implementation of framesz_to_bucket in
 * asmilator/layout.ml! */
#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) ((size_t)(64 << (bucket)))

/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE                                   \
    FRAME_MALLOC_BUCKET_TO_SIZE(FRAME_MALLOC_NBUCKETS - 1)

#endif // _WIN32
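
/* Illustrative mapping (assuming FRAME_MALLOC_NBUCKETS == 6, matching the
 * table above): both variants yield the bucket sizes
 * 64, 128, 256, 512, 1024, 2048, since (64 << bucket) reproduces the table.
 * A 100-byte request therefore rounds up to bucket 1 and is served from a
 * 128-byte frame; anything above 2048 bytes bypasses the buckets entirely
 * and falls back to __cilkrts_malloc()/__cilkrts_free(). */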

/* utility procedures */
static void push(struct free_list **b, struct free_list *p)
{
#ifdef HEADER_FILL_CHAR
    memset (p, HEADER_FILL_CHAR, FRAME_MALLOC_BUCKET_TO_SIZE(0));
#endif
    /* cons! onto free list */
    p->cdr = *b;
    *b = p;
}

static struct free_list *pop(struct free_list **b)
{
    struct free_list *p = *b;
    if (p) 
        *b = p->cdr;
    return p;
}
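
/* Sketch of the intrusive LIFO discipline implemented by push()/pop():
 * each free block stores the next pointer in its own first word, so no
 * side allocation is needed.  Starting from an empty bucket b == NULL:
 *
 *     push(&b, f1);          // b -> f1 -> NULL
 *     push(&b, f2);          // b -> f2 -> f1 -> NULL
 *     pop(&b);               // returns f2; b -> f1 -> NULL
 *     pop(&b);               // returns f1; b == NULL
 *     pop(&b);               // returns NULL
 */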

/*************************************************************
  global allocator:
*************************************************************/
/* request slightly less than 2^K from the OS, which after malloc
   overhead and alignment should end up filling each VM page almost
   completely.  128 is a guess of the total malloc overhead and cache
   line alignment */
#define FRAME_MALLOC_CHUNK (32 * 1024 - 128)
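
/* Worked sizing (an estimate, assuming 4 KiB pages and a typical ~16-byte
   malloc header): 32 * 1024 - 128 = 32640 bytes requested; 32640 + 16 =
   32656 <= 32768, so the chunk plus allocator overhead still fits in eight
   whole pages instead of spilling onto a ninth. */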

/** Implements linked list of frames */
struct pool_cons {
    char *p;                /**< This element of the list */
    struct pool_cons *cdr;  /**< Remainder of the list */
};

static void extend_global_pool(global_state_t *g)
{
    /* FIXME: memalign to a cache line? */
    struct pool_cons *c = (struct pool_cons *)__cilkrts_malloc(sizeof(*c));
    g->frame_malloc.pool_begin = 
        (char *)__cilkrts_malloc((size_t)FRAME_MALLOC_CHUNK);
    g->frame_malloc.pool_end = 
        g->frame_malloc.pool_begin + FRAME_MALLOC_CHUNK;
    g->frame_malloc.allocated_from_os += FRAME_MALLOC_CHUNK;
    c->p = g->frame_malloc.pool_begin;
    c->cdr = g->frame_malloc.pool_list;
    g->frame_malloc.pool_list = c;
}

/* the size is already canonicalized at this point */
static struct free_list *global_alloc(global_state_t *g, int bucket)
{
    struct free_list *mem;
    size_t size;

    CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool += size;

    if (!(mem = pop(&g->frame_malloc.global_free_list[bucket]))) {

        CILK_ASSERT(g->frame_malloc.pool_begin <= g->frame_malloc.pool_end);
        if (g->frame_malloc.pool_begin + size > g->frame_malloc.pool_end) {
            /* We waste the remaining fragment of the pool. */
            g->frame_malloc.wasted +=
                g->frame_malloc.pool_end - g->frame_malloc.pool_begin;
            extend_global_pool(g);
        }
        mem = (struct free_list *)g->frame_malloc.pool_begin;
        g->frame_malloc.pool_begin += size;
    }

    return mem;
}
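
/* Example (illustrative): with an empty free list for bucket 2 and a
   freshly extended pool, global_alloc() returns pool_begin and bumps it
   by 256 bytes, leaving 32640 - 256 = 32384 bytes of the chunk for later
   requests.  No per-block metadata is kept; the bucket index is always
   supplied by the caller. */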

static void global_free(global_state_t *g, void *mem, int bucket)
{
    size_t size;

    CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool -= size;

    push(&g->frame_malloc.global_free_list[bucket], mem);
}

void __cilkrts_frame_malloc_global_init(global_state_t *g)
{
    int i;

    __cilkrts_mutex_init(&g->frame_malloc.lock); 
    g->frame_malloc.check_for_leaks = 1;
    g->frame_malloc.pool_list = 0;
    g->frame_malloc.pool_begin = 0;
    g->frame_malloc.pool_end = 0;
    g->frame_malloc.batch_size = 8000;
    g->frame_malloc.potential_limit = 4 * g->frame_malloc.batch_size;
    g->frame_malloc.allocated_from_os = 0;
    g->frame_malloc.allocated_from_global_pool = 0;
    g->frame_malloc.wasted = 0;
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) 
        g->frame_malloc.global_free_list[i] = 0;
}

// Counts how many bytes are in the global free list.
static size_t count_memory_in_global_list(global_state_t *g)
{

    // Count the memory remaining in the global free list.
    size_t size_remaining_in_global_list = 0;
    int i;
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        struct free_list *p;
        size_t size_in_bucket = 0;
        p = g->frame_malloc.global_free_list[i];

        while (p) {
            size_in_bucket += FRAME_MALLOC_BUCKET_TO_SIZE(i);
            p = p->cdr;
        }
        size_remaining_in_global_list += size_in_bucket;
    }
    return size_remaining_in_global_list;
}


void __cilkrts_frame_malloc_global_cleanup(global_state_t *g)
{
    struct pool_cons *c;

    if (g->frame_malloc.check_for_leaks) {
        size_t memory_in_global_list = count_memory_in_global_list(g);
        // TBD: This check is weak.  Short of memory corruption,
        // I don't see how we could have more memory in the free list
        // than was allocated from the OS.
        // Ideally, we should count the memory in the global free list
        // and check that we have it all.  But I believe the runtime
        // itself also uses some memory, which is not being tracked.
        if (memory_in_global_list > g->frame_malloc.allocated_from_os) {
            __cilkrts_bug("\nError. The Cilk runtime data structures may have been corrupted.\n");
        }
    }
    
    while ((c = g->frame_malloc.pool_list)) {
        g->frame_malloc.pool_list = c->cdr;
        __cilkrts_free(c->p);
        __cilkrts_free(c);
    }

    __cilkrts_mutex_destroy(0, &g->frame_malloc.lock);

    // Check that all the memory moved from the global pool into
    // workers has been returned to the global pool.
    if (g->frame_malloc.check_for_leaks
        && (g->frame_malloc.allocated_from_global_pool != 0))
    {
        __cilkrts_bug("\n"
                      "---------------------------" "\n"
                      "  MEMORY LEAK DETECTED!!!  " "\n"
                      "---------------------------" "\n"
                      "\n"
            );
    }
}

/*************************************************************
  per-worker allocator
*************************************************************/
/* allocate a batch of frames of size SIZE from the global pool and
   store them in the worker's free list */
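/* Worked example (assuming the defaults set in
   __cilkrts_frame_malloc_global_init): with batch_size == 8000 and the
   64-byte bucket, one acquisition of the global lock moves
   8000 / 64 == 125 frames onto the worker's free list. */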
static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size)
{
    global_state_t *g = w->g;

    __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
#if USE_MMAP
        /* Debugging aid: give each frame its own mmap'd region with a
           PROT_NONE guard page on either side, so stray accesses fault
           immediately. */
        char *p = mmap(0, 12288, PROT_READ|PROT_WRITE,
                       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            __cilkrts_bug("mmap failed %d", errno);
        assert(size < 4096);
        assert(p != MAP_FAILED);
        mprotect(p, 4096, PROT_NONE);        /* leading guard page */
        mprotect(p + 8192, 4096, PROT_NONE); /* trailing guard page */
        w->l->bucket_potential[bucket] += size;
        /* Place the frame at the end of the writable middle page, flush
           against the trailing guard page. */
        push(&w->l->free_list[bucket], (struct free_list *)(p + 8192 - size));
#else
        size_t bytes_allocated = 0;
        do {
            w->l->bucket_potential[bucket] += size;
            bytes_allocated += size;
            push(&w->l->free_list[bucket], global_alloc(g, bucket));
        } while (bytes_allocated < g->frame_malloc.batch_size);
#endif
    } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);

}

static void gc_bucket(__cilkrts_worker *w, int bucket, size_t size)
{
    struct free_list *p, *q;
    global_state_t *g = w->g;
    size_t pot = w->l->bucket_potential[bucket];
    size_t newpot;

    /* Keep up to POT/2 bytes worth of frames in the free list.  The
       cost of counting up to POT/2 is amortized against POT. */
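    /* Example (illustrative): if pot == 32000 for the 64-byte bucket
       (the default potential_limit is 4 * 8000), the scan below stops
       once newpot reaches 16000, i.e. after counting 16000 / 64 == 250
       frames; everything beyond the stopping point is then returned to
       the global pool under the lock. */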
    for (newpot = 0, p = w->l->free_list[bucket]; p && 2 * newpot < pot;
         p = p->cdr, newpot += size)
        ;
    w->l->bucket_potential[bucket] = newpot;

    if (p) {
        /* free the rest of the list.  The cost of grabbing the lock
           is amortized against POT/2; the cost of traversing the rest
           of the list is amortized against the free operation that
           puts the element on the list. */
        __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
            while ((q = pop(&p->cdr)))
#if USE_MMAP
                munmap((char *)q + size - 8192, 12288);
#else
                global_free(g, q, bucket);
#endif
        } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
    }
}

// Free all the memory in this bucket for the specified worker,
// returning it to the global pool's free list.
static void move_bucket_to_global_free_list(__cilkrts_worker *w,
                                            int bucket)
{
    struct free_list *p, *q;
    global_state_t *g = w->g;
    p = w->l->free_list[bucket];
    
    if (p) {
        __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
            while ((q = pop(&p))) {
#if USE_MMAP
                size_t size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
                munmap((char *)q + size - 8192, 12288);
#else
                global_free(g, q, bucket);
#endif
            }
        } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
    }

    // I'm not sure this does anything useful now, since
    // the worker is about to be destroyed. But why not?
    w->l->bucket_potential[bucket] = 0;
}

static int bucket_of_size(size_t size)
{
    int i;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i)
        if (size <= FRAME_MALLOC_BUCKET_TO_SIZE(i))
            return i;

    CILK_ASSERT(0 /* can't happen */);
    return -1;
}

size_t __cilkrts_frame_malloc_roundup(size_t size)
{
    if (size > FRAME_MALLOC_MAX_SIZE) {
        /* nothing, leave it alone */
    } else {
        int bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    }
    return size;
}

size_t __cilkrts_size_of_bucket(int bucket)
{
    CILK_ASSERT(bucket >= 0 && bucket < FRAME_MALLOC_NBUCKETS);
    return FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
}

void *__cilkrts_frame_malloc(__cilkrts_worker *w, size_t size)
{
    int bucket;
    void *mem;

    /* if too large, or if no worker, fall back to __cilkrts_malloc()  */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_ALLOC_LARGE);
        return __cilkrts_malloc(size);
    }

    START_INTERVAL(w, INTERVAL_FRAME_ALLOC); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);

        while (!(mem = pop(&w->l->free_list[bucket]))) {
            /* get a batch of frames from the global pool */
            START_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL) {
                allocate_batch(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC);

    return mem;
}

void __cilkrts_frame_free(__cilkrts_worker *w, void *p0, size_t size)
{
    int bucket;
    struct free_list *p = (struct free_list *)p0;

    /* if too large, or if no worker, fall back to __cilkrts_free()  */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_FREE_LARGE);
        __cilkrts_free(p);
        return;
    }

#if CILK_LIB_DEBUG
    /* Touch the worker structure so an invalid pointer faults here
       rather than deeper in the free path. */
    *(volatile long *)w;
#endif

    START_INTERVAL(w, INTERVAL_FRAME_FREE); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
        w->l->bucket_potential[bucket] += size;
        push(&w->l->free_list[bucket], p);
        if (w->l->bucket_potential[bucket] >
            w->g->frame_malloc.potential_limit) {
            START_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL) {
                gc_bucket(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_FREE);
}
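
/* Usage sketch (illustrative; hypothetical caller code): the two entry
 * points must be paired with sizes that map to the same bucket, since the
 * bucket index is recomputed from the size on free:
 *
 *     void *f = __cilkrts_frame_malloc(w, 100);  // bucket 1, 128 bytes
 *     ...
 *     __cilkrts_frame_free(w, f, 100);           // bucket_of_size(100) == 1
 *
 * A mismatched size that lands in a different bucket would silently
 * corrupt the per-worker free lists. */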

void __cilkrts_frame_malloc_per_worker_init(__cilkrts_worker *w)
{
    int i;
    local_state *l = w->l;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        l->free_list[i] = 0;
        l->bucket_potential[i] = 0;
    }
}

void __cilkrts_frame_malloc_per_worker_cleanup(__cilkrts_worker *w)
{
    int i;
    // Move memory to the global pool.  This operation
    // ensures the memory does not become unreachable / leak
    // when the worker is destroyed.
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        move_bucket_to_global_free_list(w, i);
    }
}

/*
  Local Variables: **
  c-file-style:"bsd" **
  c-basic-offset:4 **
  indent-tabs-mode:nil **
  End: **
*/