1 #include "../include/tglist.h"
2 #include "../include/hbmap.h"
10 static unsigned string_hash(const char* s
) {
19 static int line_equal(unsigned hash1
, char* s1
, unsigned hash2
, char* s2
) {
20 return hash1
== hash2
&& !strcmp(s1
, s2
);
23 static int intcmp(const void*c1
, const void*c2
) {
29 static int usage(char *a0
) {
31 "%s - duplicate text/code finder\n"
32 "finds the biggest duplicated blocks\n"
33 "usage %s [-w] [-i min] [-s start] [-e end] file\n"
34 "-w: remove leading whitespace\n"
35 "-i min: specify minimal block length in lines (default: 4)\n"
36 "-s start: specify start line (0-based index)\n"
37 "-e end: specify end line (0-based index)\n"
38 "-a atime: number of seconds before updating status (0: no status)\n"
39 "file may be specified as '-' in which case stdin is used\n"
40 "output goes to stdout.\n"
45 static unsigned inthash(int i
) {return i
;}
47 static int sigc
, neednl
;
48 static int cl
, nl
, stdout_is_tty
, atime
= 1;
49 static void sigh(int nsig
) {
51 dprintf(2, "\rprocessed %d/%d (%.2f%%), elapsed %d", cl
, nl
, (cl
/(float)nl
)*100.f
, sigc
*atime
);
56 static int sortitem_cmp(const void* a
, const void* b
) {
57 const struct { int dupes
; int lineno
; } *a1
=a
, *b1
= b
;
58 int ret
= b1
->dupes
- a1
->dupes
;
59 if(ret
== 0) return a1
->lineno
- b1
->lineno
;
63 int main(int argc
, char **argv
) {
64 int c
, whitespace_flag
= 0;
65 size_t blocklen_min
= 4, start
= 0, end
= -1;
66 while((c
= getopt(argc
, argv
, "wi:s:e:a:")) != -1) switch(c
) {
67 case 'w': whitespace_flag
= 1; break;
68 case 'i': blocklen_min
= atoi(optarg
); break;
69 case 's': start
= atoi(optarg
); break;
70 case 'e': end
= atoi(optarg
); break;
71 case 'a': atime
= atoi(optarg
); break;
72 default: return usage(argv
[0]);
74 if(!argv
[optind
]) return usage(argv
[0]);
75 FILE *f
= argv
[optind
][0] == '-' && !argv
[optind
][1] ? stdin
: fopen(argv
[optind
], "r");
80 stdout_is_tty
= isatty(fileno(stdout
));
81 struct sortitem
{ int dupes
; int lineno
; };
82 tglist(unsigned) hashlist
;
83 tglist(char*) strlist
;
84 typedef tglist(struct sortitem
) sortitem_list
; /* we need a named type to prevent pointer type mismatch warnings */
85 hbmap(int, sortitem_list
*, 1024) dupes
;
86 tglist_init(&hashlist
);
87 tglist_init(&strlist
);
88 hbmap_init(&dupes
, intcmp
, inthash
);
91 while(fgets(buf
, sizeof buf
, f
)) {
93 if(i
- 1 < start
) continue;
94 if(end
!= (size_t) -1 && i
>= end
) break;
95 char *p
= buf
, *q
= strrchr(p
, '\n');
97 if(whitespace_flag
) while(isspace(*p
)) p
++;
98 unsigned hash
= string_hash(p
);
99 tglist_add(&hashlist
, hash
);
100 tglist_add(&strlist
, strdup(p
));
101 assert(tglist_getsize(&strlist
) == i
- start
);
102 assert(tglist_getsize(&strlist
) == tglist_getsize(&hashlist
));
103 assert(!strcmp(p
, tglist_get(&strlist
, i
-start
-1)));
105 size_t linecount
= tglist_getsize(&hashlist
);
110 signal(SIGALRM
, sigh
);
114 //for(bl = blocklen_max; bl >= blocklen_min; bl--) {
115 for(bl
= blocklen_min
; bl
< blocklen_min
+1; bl
++) {
116 for(i
= 0; i
< linecount
; ) {
118 int dupe_count
= 0, longest_dupe
= 0;
119 size_t j
, h
, last_dupe
= 0;
120 for(j
= i
+ bl
; j
+ bl
< linecount
; ) {
121 #define HL(X) tglist_get(&hashlist, X)
122 #define SL(X) tglist_get(&strlist, X)
123 for(h
= 0; h
< bl
; ++h
) {
124 if(!line_equal(HL(i
+h
), SL(i
+h
), HL(j
+h
), SL(j
+h
)))
128 /* look if the dupe is actually longer */
129 while(j
+h
< linecount
&& i
+h
< j
&& line_equal(HL(i
+h
), SL(i
+h
), HL(j
+h
), SL(j
+h
)))
131 sortitem_list
* dlist
= 0;
132 hi
= hbmap_find(&dupes
, i
);
134 if(hi
== (hbmap_iter
) -1)
135 dlist
= tglist_new();
137 dlist
= hbmap_getval(&dupes
, hi
);
139 struct sortitem tmp
= {.dupes
= h
, .lineno
= j
};
140 tglist_insert_sorted(dlist
, tmp
, sortitem_cmp
);
142 if(hi
== (hbmap_iter
) -1)
143 hbmap_insert(&dupes
, i
, dlist
);
146 if(h
> longest_dupe
) longest_dupe
= h
;
152 i
+= dupe_count
? longest_dupe
: 1;
157 sortitem_list si_list
;
158 tglist_init(&si_list
);
159 hbmap_foreach(&dupes
, hi
) {
161 sortitem_list
*dlist
= hbmap_getval(&dupes
, hi
);
162 tglist_foreach(dlist
, i
) {
163 struct sortitem
*si
= &tglist_get(dlist
, i
);
164 if(si
->dupes
> longest
) longest
= si
->dupes
;
166 assert(longest
== tglist_get(dlist
, 0).dupes
);
167 struct sortitem si
= {.dupes
= longest
, .lineno
= hbmap_getkey(&dupes
, hi
)};
168 tglist_insert_sorted(&si_list
, si
, sortitem_cmp
);
170 for(i
= 0; i
< tglist_getsize(&si_list
); i
++) {
171 struct sortitem
*si
= &tglist_get(&si_list
, i
);
172 hi
= hbmap_find(&dupes
, si
->lineno
);
173 sortitem_list
*dlist
= hbmap_getval(&dupes
, hi
);
175 tglist_foreach(dlist
, j
) {
176 if(tglist_get(dlist
, j
).dupes
< si
->dupes
) break;
179 if(neednl
&& stdout_is_tty
) {
183 printf("[DUP] %d lines at %d: %d dupes (", si
->dupes
, (int) start
+ si
->lineno
, cnt
);
184 for(j
= 0; j
< cnt
; ++j
)
185 printf("%d%s", (int) start
+ tglist_get(dlist
, j
).lineno
, j
== cnt
-1 ? ")\n" : ", ");
186 for(j
= 0; j
< si
->dupes
; j
++)
187 printf("%s\n", tglist_get(&strlist
, si
->lineno
+j
));
189 if(f
!= stdin
) fclose(f
);
190 if(neednl
) dprintf(2, "\n");