1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * Copyright (C) 2011 Norbert Thiebaud
7 /* define to activate stats reporting on hash usage*/
8 /* #define HASH_STAT */
10 /* ===============================================
11 * Set-up: defines to identify the system and system related properties
12 * ===============================================
17 #undef CORE_BIG_ENDIAN
18 #define CORE_LITTLE_ENDIAN
19 #define USE_MEMORY_ALIGNMENT 64 /* big value -> no alignment */
21 #define CORE_BIG_ENDIAN
22 #undef CORE_LITTLE_ENDIAN
23 #define USE_MEMORY_ALIGNMENT 4
28 #define CORE_BIG_ENDIAN
29 #undef CORE_LITTLE_ENDIAN
30 #define USE_MEMORY_ALIGNMENT 4
35 #undef CORE_BIG_ENDIAN
36 #define CORE_LITTLE_ENDIAN
37 #define USE_MEMORY_ALIGNMENT 64 /* big value -> no alignment */
38 #endif /* Def _MSC_VER */
40 #if defined(__linux) || defined(__OpenBSD__) || \
41 defined(__FreeBSD__) || defined(__NetBSD__) || \
42 defined(__DragonFly__) || defined(__FreeBSD_kernel__)
43 #include <sys/param.h>
44 #if __BYTE_ORDER == __LITTLE_ENDIAN
45 #undef CORE_BIG_ENDIAN
46 #define CORE_LITTLE_ENDIAN
47 #if defined(__x86_64) || defined(__i386)
48 #define USE_MEMORY_ALIGNMENT 64
50 #define USE_MEMORY_ALIGNMENT 4
52 #else /* !(__BYTE_ORDER == __LITTLE_ENDIAN) */
53 #if __BYTE_ORDER == __BIG_ENDIAN
54 #define CORE_BIG_ENDIAN
55 #undef CORE_LITTLE_ENDIAN
56 #define USE_MEMORY_ALIGNMENT 4
57 #endif /* __BYTE_ORDER == __BIG_ENDIAN */
58 #endif /* !(__BYTE_ORDER == __LITTLE_ENDIAN) */
59 #endif /* Def __linux || Def *BSD */
63 #define CORE_BIG_ENDIAN
64 #undef CORE_LITTLE_ENDIAN
65 #define USE_MEMORY_ALIGNMENT 4
66 #else /* Ndef __sparc */
67 #undef CORE_BIG_ENDIAN
68 #define CORE_LITTLE_ENDIAN
69 #define USE_MEMORY_ALIGNMENT 4
70 #endif /* Ndef __sparc */
71 #endif /* Def __sun */
73 /* Note: USE_MEMORY_ALIGNMENT is 4 for platforms that allow non-aligned short access but require int access to be aligned (e.g. sparc, ppc, zos..)
74 * USE_MEMORY_ALIGNMENT is 2 for platform that require short and int access to be aligned (e.g hppa )
75 * if the platform does not have alignment requirement (x86/amd64) use a big value (i.e > 16)
77 #ifndef USE_MEMORY_ALIGNMENT
78 #error "USE_MEMORY_ALIGNMENT must be defined to the proper alignment value for the platform"
84 #include <sys/types.h>
99 #define FILE_O_RDONLY _O_RDONLY
100 #define FILE_O_BINARY _O_BINARY
101 #define PATHNCMP _strnicmp /* MSVC converts paths to lower-case sometimes? */
102 #define inline __inline
104 #define S_ISREG(mode) (((mode) & _S_IFMT) == (_S_IFREG)) /* MSVC does not have this macro */
105 #else /* not windaube */
106 #define FILE_O_RDONLY O_RDONLY
107 #define FILE_O_BINARY 0
108 #define PATHNCMP strncmp
109 #endif /* not windaube */
/* != 0 => collapse LibreOffice-internal boost headers into a single
 * UnpackedTarball target; set in main() from $SYSTEM_BOOST. */
int internal_boost = 0;
/* $SRCDIR: root of the source tree, read from the environment in main(). */
static char* base_dir;
/* $WORKDIR: root of the build output tree, read from the environment in main(). */
static char* work_dir;
123 #define clz __builtin_clz
125 static inline int clz(unsigned int value
)
/* Read a native-endian unsigned int from a byte cursor that may not be
 * suitably aligned.
 *
 * The historical fast path for USE_MEMORY_ALIGNMENT > 4 platforms was
 * '#define get_unaligned_uint(str) (*(unsigned int*)(str))', which breaks
 * the C strict-aliasing rule (undefined behavior).  memcpy() is
 * well-defined for any alignment and modern compilers lower it to the
 * same single load where the hardware allows it, so the safe form is
 * used unconditionally. */
static inline unsigned int get_unaligned_uint(const unsigned char* cursor)
{
    unsigned int result;

    memcpy(&result, cursor, sizeof(unsigned int));
    return result;
}
150 /* ===============================================
151 * memory pool for fast fixed-size allocation (non-thread-safe)
152 * ===============================================
156 void* head_free
; /**< head of a linked list of freed element */
157 char* fresh
; /**< top of a memory block to dig new element */
158 char* tail
; /**< to detect end of extent... when fresh pass tail */
159 void* extent
; /**< pointer to the primary extent block */
160 int size_elem
; /**< size of an element. */
161 int primary
; /**< primary allocation in bytes */
162 int secondary
; /**< secondary allocation in bytes */
164 #define POOL_ALIGN_INCREMENT 8 /**< Alignment; must be a power of 2 no smaller than sizeof(void*) */
167 static void* pool_take_extent(struct pool
* pool
, int allocate
)
169 unsigned int size
= 0;
175 /* we already have an extent, so this is a secondary */
178 size
= pool
->secondary
;
183 assert(pool
->primary
);
184 size
= pool
->primary
;
188 extent
= malloc(size
);
191 *(void**)extent
= pool
->extent
;
192 pool
->extent
= extent
;
195 data
= ((char*)extent
) + POOL_ALIGN_INCREMENT
;
196 pool
->fresh
= ((char*)data
) + pool
->size_elem
;
197 pool
->tail
= pool
->fresh
+ (size
- pool
->size_elem
);
201 pool
->fresh
= ((char*)extent
) + POOL_ALIGN_INCREMENT
;
202 pool
->tail
= pool
->fresh
+ (size
- pool
->size_elem
);
209 /* Create a memory pool for fix size objects
210 * this is a simplified implementation that
211 * is _not_ thread safe.
213 struct pool
* pool_create(int size_elem
, int primary
, int secondary
)
218 assert(secondary
>= 0);
219 assert(size_elem
> 0);
221 pool
= (struct pool
*)calloc(1, sizeof(struct pool
));
222 if(!pool
) return NULL
;
223 /* Adjust the element size so that it be aligned, and so that an element could
224 * at least contain a void*
226 pool
->size_elem
= size_elem
= (size_elem
+ POOL_ALIGN_INCREMENT
- 1) & ~(POOL_ALIGN_INCREMENT
- 1);
228 pool
->primary
= (size_elem
* primary
) + POOL_ALIGN_INCREMENT
;
229 pool
->secondary
= secondary
> 0 ? (size_elem
* secondary
) + POOL_ALIGN_INCREMENT
: 0;
230 pool_take_extent(pool
, FALSE
);
236 void pool_destroy(struct pool
* pool
)
243 extent
= pool
->extent
;
246 next
= *(void**)extent
;
254 static inline void* pool_alloc(struct pool
* pool
)
258 data
= pool
->head_free
;
261 /* we have no old-freed elem */
262 if(pool
->fresh
<= pool
->tail
)
264 /* pick a slice of the current extent */
265 data
= (void*)pool
->fresh
;
266 pool
->fresh
+= pool
->size_elem
;
270 /* allocate a new extent */
271 data
= pool_take_extent(pool
, TRUE
);
276 /* re-use an old freed element by chopping the head of the free list */
277 pool
->head_free
= *(void**)data
;
284 static inline void pool_free(struct pool
* pool
, void* data
)
286 assert(pool
&& data
);
288 /* stack on top of the free list */
289 *(void**)data
= pool
->head_free
;
290 pool
->head_free
= data
;
294 /* ===============================================
295 * Hash implementation customized to be just tracking
296 * a unique list of string (i.e no data associated
297 * with the key, no need for retrieval, etc..
299 * This is tuned for the particular use-case we have here
300 * measures in tail_build showed that
301 * we can get north of 4000 distinct values stored in a hash
302 * the collision rate is at worse around 2%
303 * the collision needing an expensive memcmp to resolve
304 * have a rate typically at 1 per 1000
305 * for tail_build we register 37229 unique key
306 * with a total of 377 extra memcmp needed
307 * which is completely negligible compared to the
308 * number of memcmp required to eliminate duplicate
309 * entry (north of 2.5 millions for tail_build)
310 * ===============================================
315 struct hash_elem
* next
;
322 struct hash_elem
** array
;
323 struct pool
* elems_pool
;
327 unsigned int load_limit
;
335 #define HASH_F_NO_RESIZE (1<<0)
337 /* The following hash_compute function was adapted from :
338 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
340 * The changes from the original are mostly cosmetic
342 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
345 #if defined CORE_BIG_ENDIAN
346 #define MASK_C1 0xFFFFFF00
347 #define MASK_C2 0xFFFF0000
348 #define MASK_C3 0xFF000000
349 #elif defined CORE_LITTLE_ENDIAN
350 #define MASK_C1 0xFFFFFF
351 #define MASK_C2 0xFFFF
354 #error "Missing Endianness definition"
360 a -= c; a ^= rot(c, 4); c += b; \
361 b -= a; b ^= rot(a, 6); a += c; \
362 c -= b; c ^= rot(b, 8); b += a; \
363 a -= c; a ^= rot(c,16); c += b; \
364 b -= a; b ^= rot(a,19); a += c; \
365 c -= b; c ^= rot(b, 4); b += a; \
367 #define final(a,b,c) \
369 c ^= b; c -= rot(b,14); \
370 a ^= c; a -= rot(c,11); \
371 b ^= a; b -= rot(a,25); \
372 c ^= b; c -= rot(b,16); \
373 a ^= c; a -= rot(c,4); \
374 b ^= a; b -= rot(a,14); \
375 c ^= b; c -= rot(b,24); \
378 static unsigned int hash_compute( struct hash
* hash
, const char* key
, int length
)
382 unsigned int c
; /* internal state */
383 const unsigned char* uk
= (const unsigned char*)key
;
385 /* Set up the internal state */
386 a
= b
= c
= 0xdeadbeef + (length
<< 2);
388 /* we use this to 'hash' full path with mostly a common root
389 * let's not waste too many cycles hashing mostly constant stuff
396 /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
399 a
+= get_unaligned_uint(uk
);
400 b
+= get_unaligned_uint(uk
+4);
401 c
+= get_unaligned_uint(uk
+8);
407 /*----------------------------- handle the last (probably partial) block */
408 /* Note: we possibly over-read, which would trigger complaint from VALGRIND
409 * but we mask the undefined stuff if any, so we are still good, thanks
410 * to alignment of memory allocation and tail-memory management overhead
411 * we always can read 3 bytes past the official end without triggering
412 * a segfault -- if you find a platform/compiler couple for which that postulate
413 * is false, then you just need to over-allocate by 2 more bytes in file_load()
414 * file_load already over-allocates by 1 to stick a \0 at the end of the buffer.
418 case 12: c
+=get_unaligned_uint(uk
+8); b
+=get_unaligned_uint(uk
+4); a
+=get_unaligned_uint(uk
); break;
419 case 11: c
+=get_unaligned_uint(uk
+8) & MASK_C1
; b
+=get_unaligned_uint(uk
+4); a
+=get_unaligned_uint(uk
); break;
420 case 10: c
+=get_unaligned_uint(uk
+8) & MASK_C2
; b
+=get_unaligned_uint(uk
+4); a
+=get_unaligned_uint(uk
); break;
421 case 9 : c
+=get_unaligned_uint(uk
+8) & MASK_C3
; b
+=get_unaligned_uint(uk
+4); a
+=get_unaligned_uint(uk
); break;
422 case 8 : b
+=get_unaligned_uint(uk
+4); a
+=get_unaligned_uint(uk
); break;
423 case 7 : b
+=get_unaligned_uint(uk
+4) & MASK_C1
; a
+=get_unaligned_uint(uk
); break;
424 case 6 : b
+=get_unaligned_uint(uk
+4) & MASK_C2
; a
+=get_unaligned_uint(uk
); break;
425 case 5 : b
+=get_unaligned_uint(uk
+4) & MASK_C3
; a
+=get_unaligned_uint(uk
); break;
426 case 4 : a
+=get_unaligned_uint(uk
); break;
427 case 3 : a
+=get_unaligned_uint(uk
) & MASK_C1
; break;
428 case 2 : a
+=get_unaligned_uint(uk
) & MASK_C2
; break;
429 case 1 : a
+=get_unaligned_uint(uk
) & MASK_C3
; break;
430 case 0 : return c
& hash
->size
; /* zero length strings require no mixing */
434 return c
& hash
->size
;
437 static void hash_destroy(struct hash
* hash
)
447 pool_destroy(hash
->elems_pool
);
453 static struct hash
* hash_create(unsigned int size
)
458 hash
= calloc(1, sizeof(struct hash
));
461 size
+= (size
>> 2) + 1; /* ~ 75% load factor */
464 hash
->size
= (((unsigned int)0xFFFFFFFF) >> clz((unsigned int)size
));
468 hash
->size
= size
= 15;
470 hash
->load_limit
= hash
->size
- (hash
->size
>> 2);
472 hash
->array
= (struct hash_elem
**)calloc(hash
->size
+ 1, sizeof(struct hash_elem
*));
473 if(hash
->array
== NULL
)
481 hash
->elems_pool
= pool_create(sizeof(struct hash_elem
),
483 if(!hash
->elems_pool
)
492 static void hash_resize(struct hash
* hash
)
494 unsigned int old_size
= hash
->size
;
496 struct hash_elem
* hash_elem
;
497 struct hash_elem
* next
;
498 struct hash_elem
** array
;
501 hash
->size
= (old_size
<< 1) + 1;
502 /* we really should avoid to get there... so print a message to alert of the condition */
503 fprintf(stderr
, "resize hash %d -> %d\n", old_size
, hash
->size
);
504 if(hash
->size
== old_size
)
506 hash
->flags
|= HASH_F_NO_RESIZE
;
509 array
= calloc(hash
->size
+ 1, sizeof(struct hash_elem
*));
512 hash
->load_limit
= hash
->size
- (hash
->size
>> 2);
513 for(i
=0; i
<= old_size
; i
++)
515 hash_elem
= (struct hash_elem
*)hash
->array
[i
];
518 next
= hash_elem
->next
;
520 hashed
= hash_compute(hash
, hash_elem
->key
, hash_elem
->key_len
);
521 hash_elem
->next
= array
[hashed
];
522 array
[hashed
] = hash_elem
;
527 hash
->array
= (struct hash_elem
**)array
;
531 hash
->size
= old_size
;
532 hash
->flags
|= HASH_F_NO_RESIZE
;
537 static inline int compare_key(struct hash
* hash
, const char* a
, const char* b
, int len
, int* cost
)
541 return memcmp(a
,b
, len
);
544 #define compare_key(h,a,b,l,c) memcmp(a,b,l)
547 /* a customized hash_store function that just store the key and return
548 * TRUE if the key was effectively stored, or FALSE if the key was already there
550 static int hash_store(struct hash
* hash
, const char* key
, int key_len
)
553 struct hash_elem
* hash_elem
;
557 hashed
= hash_compute(hash
, key
, key_len
);
561 hash_elem
= (struct hash_elem
*)hash
->array
[hashed
];
562 while(hash_elem
&& (hash_elem
->key_len
!= key_len
|| compare_key(hash
, hash_elem
->key
, key
, key_len
, &cost
)))
564 hash_elem
= hash_elem
->next
;
569 hash_elem
= pool_alloc(hash
->elems_pool
);
572 hash_elem
->key
= key
;
573 hash_elem
->key_len
= key_len
;
574 hash_elem
->next
= hash
->array
[hashed
];
579 hash
->collisions
+= 1;
583 hash
->array
[hashed
] = hash_elem
;
585 if(hash
->used
> hash
->load_limit
)
595 static int file_stat(const char* name
, struct stat
* buffer_stat
, int* rc
)
599 rc_local
= stat(name
, buffer_stat
);
607 static off_t
file_get_size(const char* name
, int* rc
)
609 struct stat buffer_stat
;
612 if (!file_stat(name
, &buffer_stat
, rc
))
614 if(S_ISREG(buffer_stat
.st_mode
))
616 size
= buffer_stat
.st_size
;
626 static char* file_load(const char* name
, off_t
* size
, int* return_rc
)
628 off_t local_size
= 0;
633 assert(name
!= NULL
);
639 *size
= file_get_size(name
, &rc
);
642 fd
= open(name
, FILE_O_RDONLY
| FILE_O_BINARY
);
645 buffer
= malloc((size_t)(*size
+ 1));
655 i
= read(fd
, buffer
, (size_t)(*size
));
692 static void _cancel_relative(char* base
, char** ref_cursor
, char** ref_cursor_out
, char* end
)
694 char* cursor
= *ref_cursor
;
695 char* cursor_out
= *ref_cursor_out
;
700 while(cursor_out
> base
&& cursor_out
[-1] == '/')
702 while(cursor_out
> base
&& *--cursor_out
!= '/');
704 while(cursor
+ 3 < end
&& !memcmp(cursor
, "/../", 4));
705 *ref_cursor
= cursor
;
706 *ref_cursor_out
= cursor_out
;
/* Advance *token past any run of space or tab characters.
 * The caller's pointer is updated in place; the string is not modified. */
static inline void eat_space(char ** token)
{
    while ((' ' == **token) || ('\t' == **token)) {
        *token += 1;
    }
}
717 * Prune LibreOffice specific duplicate dependencies to improve
718 * gnumake startup time, and shrink the disk-space footprint.
721 elide_dependency(const char* key
, int key_len
, const char **unpacked_end
)
726 fprintf (stderr
, "elide?%d!: '", internal_boost
);
727 for (i
= 0; i
< key_len
; i
++) {
728 fprintf (stderr
, "%c", key
[i
]);
730 fprintf (stderr
, "'\n");
734 /* boost brings a plague of header files */
737 /* walk down path elements */
738 for (i
= 0; i
< key_len
- 1; i
++)
744 if (!PATHNCMP(key
+ i
+ 1, "workdir/", 8))
752 if (!PATHNCMP(key
+ i
+ 1, "UnpackedTarball/", 16))
755 *unpacked_end
= strchr(key
+ i
+ 17, '/');
766 * We collapse tens of internal boost headers to the unpacked target, such
767 * that you can re-compile / install boost and all is well.
769 static void emit_single_boost_header(void)
771 #define BOOST_TARGET "/UnpackedTarball/boost.done"
772 fprintf(stdout
, "%s" BOOST_TARGET
" ", work_dir
);
/* Emit an UnpackedTarball dependency as its completion stamp: write the
 * token up to (but excluding) 'end', then append ".done " so the target
 * name plus separator lands on stdout. */
static void emit_unpacked_target(const char* token, const char* end)
{
    fwrite(token, 1, end - token, stdout);
    fputs(".done ", stdout);
}
781 /* prefix paths to absolute */
782 static inline void print_fullpaths(char* line
)
788 const char * unpacked_end
= 0; /* end of UnpackedTarget match (if any) */
789 /* for UnpackedTarget the target is GenC{,xx}Object, dont mangle! */
797 /* hard to believe that in this day and age drive letters still exist */
798 if (*end
&& (':' == *(end
+1)) &&
799 (('\\' == *(end
+2)) || ('/' == *(end
+2))) && isalpha(*end
))
801 end
= end
+ 3; /* only one cross, err drive letter per filename */
803 while (*end
&& (' ' != *end
) && ('\t' != *end
) && (':' != *end
)) {
806 token_len
= end
- token
;
808 elide_dependency(token
, token_len
, &unpacked_end
))
812 if (internal_boost
&& !PATHNCMP(unpacked_end
- 5, "boost", 5))
815 if (boost_count
== 1)
816 emit_single_boost_header();
819 /* don't output, and swallow trailing \\\n if any */
822 if (token
[0] == '\\' && token
[1] == '\n')
828 emit_unpacked_target(token
, unpacked_end
);
835 if (fwrite(token
, token_len
, 1, stdout
) != 1)
854 static inline char * eat_space_at_end(char * end
)
857 assert('\0' == *end
);
859 while (' ' == *real_end
|| '\t' == *real_end
|| '\n' == *real_end
861 { /* eat colon and whitespace at end */
867 static int _process(struct hash
* dep_hash
, char* fn
)
875 int continuation
= 0;
879 buffer
= file_load(fn
, &size
, &rc
);
880 /* Note: yes we are going to leak 'buffer'
881 * this is on purpose, to avoid cloning the 'key' out of it
882 * and our special 'hash' just store the pointer to the key
883 * inside of buffer, hence it need to remain allocated
887 base
= cursor_out
= cursor
= end
= buffer
;
890 /* first eat unneeded space at the beginning of file
892 while(cursor
< end
&& (*cursor
== ' ' || *cursor
== '\\'))
900 *cursor_out
++ = *cursor
++;
902 else if(*cursor
== '/')
906 if(!memcmp(cursor
, "/../", 4))
908 _cancel_relative(base
, &cursor
, &cursor_out
, end
);
911 *cursor_out
++ = *cursor
++;
913 else if(*cursor
== '\n')
920 /* here we have a complete rule */
923 /* if the rule ended in ':' that is a no-dep rule
924 * these are the one for which we want to filter
927 int key_len
= eat_space_at_end(cursor_out
) - base
;
928 if (!elide_dependency(base
,key_len
+ 1, NULL
)
929 && hash_store(dep_hash
, base
, key_len
))
931 /* DO NOT modify base after it has been added
932 as key by hash_store */
933 print_fullpaths(base
);
939 /* rule with dep, just write it */
940 print_fullpaths(base
);
943 last_ns
= ' '; // cannot hurt to reset it
946 base
= cursor_out
= cursor
;
950 /* here we have a '\' followed by \n this is a continuation
951 * i.e not a complete rule yet
953 *cursor_out
++ = *cursor
++;
954 continuation
= 0; // cancel current one (empty lines!)
960 /* not using isspace() here save 25% of I refs and 75% of D refs based on cachegrind */
961 if(*cursor
!= ' ' && *cursor
!= '\n' && *cursor
!= '\t' )
965 *cursor_out
++ = *cursor
++;
968 /* just in case the file did not end with a \n, there may be a pending rule */
969 if(base
< cursor_out
)
973 int key_len
= eat_space_at_end(cursor_out
) - base
;
974 if (!elide_dependency(base
,key_len
+ 1, NULL
) &&
975 hash_store(dep_hash
, base
, key_len
))
/* Print command-line usage to stderr. */
static void _usage(void)
{
    fputs("Usage: concat-deps <file that contains dep_files>\n", stderr);
}
996 #define kDEFAULT_HASH_SIZE 4096
/* Fetch environment variable 'name' into *var.
 *
 * Returns 0 on success.  If the variable is unset, prints an error to
 * stderr and returns 1 (with *var == NULL) -- callers such as main()
 * treat any non-zero result as fatal.  The returned string is owned by
 * the environment and must not be freed. */
static int get_var(char **var, const char *name)
{
    *var = (char *)getenv(name);
    if(!*var)
    {
        fprintf(stderr,"Error: %s is missing in the environement\n", name);
        return 1;
    }
    return 0;
}
1009 int main(int argc
, char** argv
)
1012 off_t in_list_size
= 0;
1014 char* in_list_cursor
;
1016 struct hash
* dep_hash
;
1017 const char *env_str
;
1024 if(get_var(&base_dir
, "SRCDIR") || get_var(&work_dir
, "WORKDIR"))
1027 env_str
= getenv("SYSTEM_BOOST");
1028 internal_boost
= !env_str
|| strcmp(env_str
,"TRUE");
1030 in_list
= file_load(argv
[1], &in_list_size
, &rc
);
1033 dep_hash
= hash_create( kDEFAULT_HASH_SIZE
);
1034 in_list_base
= in_list_cursor
= in_list
;
1036 /* extract filename of dep file from a 'space' separated list */
1037 while(*in_list_cursor
)
1039 if(*in_list_cursor
== ' ' || *in_list_cursor
== '\n')
1041 *in_list_cursor
= 0;
1042 if(in_list_base
< in_list_cursor
)
1044 rc
= _process(dep_hash
, in_list_base
);
1050 in_list_cursor
+= 1;
1051 in_list_base
= in_list_cursor
;
1055 in_list_cursor
+= 1;
1060 /* catch the last entry in case the input did not terminate with a 'space' */
1061 if(in_list_base
< in_list_cursor
)
1063 rc
= _process(dep_hash
, in_list_base
);
1067 fprintf(stderr
, "stats: u:%d s:%d l:%d t:%d c:%d m:%d $:%d\n",
1068 dep_hash
->used
, dep_hash
->size
, dep_hash
->load_limit
, dep_hash
->stored
,
1069 dep_hash
->collisions
, dep_hash
->memcmp
, dep_hash
->cost
);
1075 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */