add normalizepath_cstr()
[rofl0r-libulz.git] / examples / codedup.c
blob89dcc79f5fc9cd4654cb183834f24a72556f1d03
1 #include "../include/tglist.h"
2 #include "../include/hbmap.h"
3 #include <stdio.h>
4 #include <stdint.h>
5 #include <unistd.h>
6 #include <ctype.h>
7 #include <signal.h>
8 #include <assert.h>
10 static unsigned string_hash(const char* s) {
11 uint_fast32_t h = 0;
12 while (*s) {
13 h = 16*h + *s++;
14 h ^= h>>24 & 0xf0;
16 return h & 0xfffffff;
19 static int line_equal(unsigned hash1, char* s1, unsigned hash2, char* s2) {
20 return hash1 == hash2 && !strcmp(s1, s2);
23 static int intcmp(const void*c1, const void*c2) {
24 const int* i1 = c1;
25 const int* i2 = c2;
26 return *i1 - *i2;
29 static int usage(char *a0) {
30 dprintf(2,
31 "%s - duplicate text/code finder\n"
32 "finds the biggest duplicated blocks\n"
33 "usage %s [-w] [-i min] [-s start] [-e end] file\n"
34 "-w: remove leading whitespace\n"
35 "-i min: specify minimal block length in lines (default: 4)\n"
36 "-s start: specify start line (0-based index)\n"
37 "-e end: specify end line (0-based index)\n"
38 "-a atime: number of seconds before updating status (0: no status)\n"
39 "file may be specified as '-' in which case stdin is used\n"
40 "output goes to stdout.\n"
41 , a0, a0);
42 return 1;
45 static unsigned inthash(int i) {return i;}
47 static int sigc, neednl;
48 static int cl, nl, stdout_is_tty, atime = 1;
49 static void sigh(int nsig) {
50 sigc++;
51 dprintf(2, "\rprocessed %d/%d (%.2f%%), elapsed %d", cl, nl, (cl/(float)nl)*100.f, sigc*atime);
52 alarm(atime);
53 neednl = 1;
56 static int sortitem_cmp(const void* a, const void* b) {
57 const struct { int dupes; int lineno; } *a1 =a, *b1 = b;
58 int ret = b1->dupes - a1->dupes;
59 if(ret == 0) return a1->lineno - b1->lineno;
60 return ret;
63 int main(int argc, char **argv) {
64 int c, whitespace_flag = 0;
65 size_t blocklen_min = 4, start = 0, end = -1;
66 while((c = getopt(argc, argv, "wi:s:e:a:")) != -1) switch(c) {
67 case 'w': whitespace_flag = 1; break;
68 case 'i': blocklen_min = atoi(optarg); break;
69 case 's': start = atoi(optarg); break;
70 case 'e': end = atoi(optarg); break;
71 case 'a': atime = atoi(optarg); break;
72 default: return usage(argv[0]);
74 if(!argv[optind]) return usage(argv[0]);
75 FILE *f = argv[optind][0] == '-' && !argv[optind][1] ? stdin : fopen(argv[optind], "r");
76 if(!f) {
77 perror("fopen");
78 return 1;
80 stdout_is_tty = isatty(fileno(stdout));
81 struct sortitem { int dupes; int lineno; };
82 tglist(unsigned) hashlist;
83 tglist(char*) strlist;
84 typedef tglist(struct sortitem) sortitem_list; /* we need a named type to prevent pointer type mismatch warnings */
85 hbmap(int, sortitem_list*, 1024) dupes;
86 tglist_init(&hashlist);
87 tglist_init(&strlist);
88 hbmap_init(&dupes, intcmp, inthash);
89 char buf[1024];
90 size_t i = 0;
91 while(fgets(buf, sizeof buf, f)) {
92 ++i;
93 if(i - 1 < start) continue;
94 if(end != (size_t) -1 && i >= end) break;
95 char *p = buf, *q = strrchr(p, '\n');
96 if(q) *q = 0;
97 if(whitespace_flag) while(isspace(*p)) p++;
98 unsigned hash = string_hash(p);
99 tglist_add(&hashlist, hash);
100 tglist_add(&strlist, strdup(p));
101 assert(tglist_getsize(&strlist) == i - start);
102 assert(tglist_getsize(&strlist) == tglist_getsize(&hashlist));
103 assert(!strcmp(p, tglist_get(&strlist, i-start-1)));
105 size_t linecount = tglist_getsize(&hashlist);
106 size_t bl;
107 nl = linecount;
109 if(atime) {
110 signal(SIGALRM, sigh);
111 alarm(atime);
114 //for(bl = blocklen_max; bl >= blocklen_min; bl--) {
115 for(bl = blocklen_min; bl < blocklen_min+1; bl++) {
116 for(i = 0; i < linecount; ) {
117 hbmap_iter hi;
118 int dupe_count = 0, longest_dupe = 0;
119 size_t j, h, last_dupe = 0;
120 for(j = i + bl; j + bl < linecount; ) {
121 #define HL(X) tglist_get(&hashlist, X)
122 #define SL(X) tglist_get(&strlist, X)
123 for(h = 0; h < bl; ++h) {
124 if(!line_equal(HL(i+h), SL(i+h), HL(j+h), SL(j+h)))
125 break;
127 if(h == bl) {
128 /* look if the dupe is actually longer */
129 while(j+h < linecount && i+h < j && line_equal(HL(i+h), SL(i+h), HL(j+h), SL(j+h)))
130 h++;
131 sortitem_list* dlist = 0;
132 hi = hbmap_find(&dupes, i);
134 if(hi == (hbmap_iter) -1)
135 dlist = tglist_new();
136 else
137 dlist = hbmap_getval(&dupes, hi);
139 struct sortitem tmp = {.dupes = h, .lineno = j};
140 tglist_insert_sorted(dlist, tmp, sortitem_cmp);
142 if(hi == (hbmap_iter) -1)
143 hbmap_insert(&dupes, i, dlist);
145 last_dupe = j;
146 if(h > longest_dupe) longest_dupe = h;
147 ++dupe_count;
150 j+= h ? h : 1;
152 i += dupe_count ? longest_dupe : 1;
153 cl = i;
156 hbmap_iter hi;
157 sortitem_list si_list;
158 tglist_init(&si_list);
159 hbmap_foreach(&dupes, hi) {
160 int longest = 0;
161 sortitem_list*dlist = hbmap_getval(&dupes, hi);
162 tglist_foreach(dlist, i) {
163 struct sortitem *si = &tglist_get(dlist, i);
164 if(si->dupes > longest) longest = si->dupes;
166 assert(longest == tglist_get(dlist, 0).dupes);
167 struct sortitem si = {.dupes = longest, .lineno = hbmap_getkey(&dupes, hi)};
168 tglist_insert_sorted(&si_list, si, sortitem_cmp);
170 for(i = 0; i < tglist_getsize(&si_list); i++) {
171 struct sortitem *si = &tglist_get(&si_list, i);
172 hi = hbmap_find(&dupes, si->lineno);
173 sortitem_list *dlist = hbmap_getval(&dupes, hi);
174 int j, cnt = 0;
175 tglist_foreach(dlist, j) {
176 if(tglist_get(dlist, j).dupes < si->dupes) break;
177 ++cnt;
179 if(neednl && stdout_is_tty) {
180 dprintf(2, "\n");
181 neednl = 0;
183 printf("[DUP] %d lines at %d: %d dupes (", si->dupes, (int) start + si->lineno, cnt);
184 for(j = 0; j < cnt; ++j)
185 printf("%d%s", (int) start + tglist_get(dlist, j).lineno, j == cnt -1 ? ")\n" : ", ");
186 for(j = 0; j < si->dupes; j++)
187 printf("%s\n", tglist_get(&strlist, si->lineno+j));
189 if(f != stdin) fclose(f);
190 if(neednl) dprintf(2, "\n");
191 return 0;