// util.c
//
// Helper library built around mmap():
//   caddr_t mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
//   fd          -> obtained from open()
//   size/length -> obtained from stat()

// POSIX calls used to map a file:
//   fd = int open(const char *path, int flags);      flags -> O_RDONLY
//   int fstat(int filedes, struct stat *buf);        Total_Size = buf->st_size
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <dlfcn.h>

#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <gcrypt.h>

#include "util.h"

  29 void *libsplit_handle;
  30 void (*libsplit_init)();
  31 int (*libsplit_loadbuffer)(char *buff, int size, int nfile);
  32 int (*libsplit_split)(int nfile,int *pblocks);
  33
  34
  35
  36 int loadsplitplugin(void) {
  37         const char *error;
  38         libsplit_handle = dlopen ("./plugin-split-lines.so", RTLD_LAZY);
  39         if (!libsplit_handle) {
  40                 fputs (dlerror(), stderr);
  41                 return 0;
  42         }
  43
  44         libsplit_init = dlsym(libsplit_handle, "init");
  45         libsplit_loadbuffer = dlsym(libsplit_handle, "loadbuffer");
  46         libsplit_split = dlsym(libsplit_handle, "split");
  47         if ((error = dlerror()) != NULL)  {
  48                 fputs(error, stderr);
  49                 return 0;
  50         }
  51         return 1;
  52 }
  53
  54 int unloadsplitplugin(void) {
  55         dlclose(libsplit_handle);
  56         return 1;
  57 }
  58
  59 char *loadfile(char *filename, int *size) {
  60
  61         int fd = open(filename, O_RDONLY);
  62
  63   *size=0;
  64         struct stat buf;
  65
  66   if (fstat(fd, &buf)==-1) return NULL;
  67
  68   int filesize = buf.st_size;
  69         char *data=mmap(0, filesize, PROT_READ , MAP_SHARED, fd, 0);
  70   if ((int) data==-1) return NULL;
  71
  72   *size=filesize;
  73         return data;
  74 }
  75
  76
  77 unsigned int *hash_loadfile(char *filename, int *size) {
  78         char *nombre=filename, linea[MAX_LINE];
  79         FILE *fichero;
  80         fichero = fopen( nombre, "r" );
  81
  82         if( !fichero ) {
  83                 printf( "Error (NO ABIERTO)\n" );
  84                 return NULL;
  85         }
  86         char *txt;
  87
  88         int lines;
  89         for (lines=0;fgets(linea, MAX_LINE, fichero); lines++);
  90         rewind(fichero);
  91         *size=lines;
  92         unsigned int *data=malloc(lines*sizeof(unsigned int*));
  93         int line=0;
  94         while (txt=fgets(linea, MAX_LINE, fichero)) {
  95                 reducetext(txt);
  96                 data[line++]=ihash(txt);
  97         }
  98
  99         if( fclose(fichero)!=0 ) {
 100                 printf( "\nError: fichero NO CERRADO\n" );
 101                 return NULL;
 102         }
 103
 104         return data;
 105
 106 }
 107
 108 // ihash calcula un hash de 32 bits para un texto.
 109 unsigned int ihash(char *txt) {
 110         // Longitud del mensaje a cifrar
 111         int msg_len = strlen( txt );
 112
 113         //  Longitud del hash resultante - gcry_md_get_algo_dlen
 114         // devuelve la longitud del resumen hash para un algoritmo
 115         int hash_len = gcry_md_get_algo_dlen( HASH_TYPE );
 116
 117         // Salida del hash SHA1 - esto serán datos binarios
 118         unsigned char hash[ hash_len ];
 119
 120         // Calcular el resumen SHA1. Esto es una especie de función-atajo,
 121         // ya que la mayoría de funciones gcrypt requieren
 122         // la creación de un handle, etc.
 123         gcry_md_hash_buffer( HASH_TYPE, hash, txt, msg_len );
 124
 125         //      unsigned int ihash=*((unsigned int *)hash);
 126         return *((unsigned int *)hash);
 127 }
 128
 129
 130 void reducetext(char * txt) {
 131
 132         int n=0, nn=0;
 133         char newline[256];
 134         char lastc=0;
 135         char c=txt[n];
 136         char type=0; // Tipos de palabras o grupos:
 137         // a -> texto, variable.
 138         // 1 -> números, con, sin decimales.
 139         // % -> símbolos unarios, binarios.
 140         // 0 -> huecos y espacios
 141
 142         for (n=0;n<MAX_LINE;n++) {
 143                 c=tolower(txt[n]); // Captura del carácter en minúscula.
 144
 145                 // Traducción del carácter.
 146                 switch(c)
 147                 {
 148                         // Retonos de carro y fin de fichero: salir de la función.
 149                         case 10:
 150                         case 13:
 151                         case 0:
 152                                 n=MAX_LINE; continue;
 153                         // Tabuladores y espacios: cuentan como espacio.
 154                         case ' ':
 155                         case '\t':
 156                                 c=' '; break;
 157                         // Acentos.
 158 /*
 159                         case 'á': c='a'; break;
 160                         case 'é': c='e'; break;
 161                         case 'í': c='i'; break;
 162                         case 'ó': c='o'; break;
 163                         case 'ú': c='u'; break;
 164                         */
 165                 }
 166
 167                 switch(type) // Cambios de tipos según algunos datos.
 168                 {
 169                         case 0: // Segun si estábamos en un espacio.
 170                                 if (c>='0' && c<='9') {
 171                                         type='1';
 172                                 } else if (isalpha(c)) {
 173                                         type='a';
 174                                 } else type='%';
 175                         break;
 176                         case '1': // Segun si estábamos en un espacio.
 177                                 if (c>='0' && c<='9') {
 178                                         type='1';
 179                                 } else if (c=='.') {
 180                                         type='1';
 181                                 } else if (isalpha(c)) {
 182                                         type='a';
 183                                 } else type='%';
 184                         break;
 185                         case 'a': // Segun si estábamos en un espacio.
 186                                 if (c>='0' && c<='9') {
 187                                         type='a';
 188                                 } else if (isalpha(c)) {
 189                                         type='a';
 190                                 } else type='%';
 191                         break;
 192                         default:
 193                         case '%':
 194                                 if (c==' ') {
 195                                         continue;
 196                                 } else
 197                                 if (c>='0' && c<='9') {
 198                                         type='1';
 199                                 } else if (isalpha(c)) {
 200                                         type='a';
 201                                 } else type='%';
 202                         break;
 203                 }
 204                 if (c==' ') type=0;
 205                 if (!nn && c==' ') continue; // Si está tabulando al inicio, tampoco tiene efecto.
 206                 if (c==lastc && (type=='a' || type==0)) continue; // Desperdiciar letras repetidas.
 207
 208                 if (type=='%' && newline[nn-1]==' ') nn--;
 209                 newline[nn]=c;
 210
 211                 nn++;
 212                 lastc=c;
 213         }
 214         newline[nn]=0;
 215         strcpy(txt,newline);
 216
 217 }
 218
 219
 220 int compare2hashvectors(int *Bvector, int Bsize, int *Mvector, int Msize,
 221         int MaxPassSize, hashblock *blocks, int blocksize)
 222 {
 223                 // Bvector: vector of hashes of Base, original or unmodified file.
 224                 // Mvector: vector of hashes of modified file.
 225                 // Bsize and Msize: Stores the size of their arrays.
 226                 // MaxPassSize: (default: 256) Which is the maximum block-size of algorithm
 227
 228                 int line_base,line_local,size;
 229                 int maxsize=1,total=0;
 230                 int i,k,m;
 231                 hashblock *bloque=blocks;
 232                 int nbloques=0;
 233                 int lbb=0;
 234                 int conf_pasada[]={256,128,64,32,16,8,4,2,1,0};
 235                 int p;
 236                 int min_bloque=0;
 237                 for (p=0;min_bloque=conf_pasada[p];p++)
 238                 {
 239                         if (conf_pasada[p]>MaxPassSize) continue;
 240
 241                         for (i=0;i<Bsize;i+=maxsize) {
 242                                 maxsize=1;
 243                                 int j;
 244                                 for (j=0;j<nbloques;j++)
 245                                 {
 246                                         if (i>=bloque[j].line1 && i<=bloque[j].line1+bloque[j].size) break;
 247                                 }
 248                                 if (j<nbloques)
 249                                 {
 250                                         i=bloque[j].line1+bloque[j].size; continue;
 251                                 }
 252
 253
 254                                 for (k=0;k<Msize;k++)
 255                                 {
 256                                         int j;
 257                                         for (j=0;j<nbloques;j++)
 258                                         {
 259                                                 if (k>=bloque[j].line2 && k<=bloque[j].line2+bloque[j].size) break;
 260                                         }
 261                                         if (j<nbloques)
 262                                         {
 263                                                 k=bloque[j].line2+bloque[j].size; continue;
 264                                         }
 265
 266                                         if (Bvector[i]==Mvector[k])
 267                                         {
 268                                                 int nz=0,nzbl=0;
 269
 270                                                 for(m=0;k+m<Msize && i+m<Bsize;m++)
 271                                                 {
 272                                                         for (j=0;j<nbloques;j++)
 273                                                         {
 274                                                                 if (k+m>=bloque[j].line2 && k+m<=bloque[j].line2+bloque[j].size) break;
 275                                                                 if (i+m>=bloque[j].line1 && i+m<=bloque[j].line1+bloque[j].size) break;
 276                                                         }
 277                                                         if (j<nbloques) break;
 278                                                         if (Bvector[i+m]!=Mvector[k+m])
 279                                                         {
 280                                                                 nz++;
 281                                                                 if (nz>size/4) break;
 282                                                                 continue;
 283                                                         }
 284                                                         if (nz==0) size=m;
 285                                                         else
 286                                                         {
 287                                                                 nzbl++;
 288                                                                 if (nzbl>2)
 289                                                                 {
 290                                                                         nzbl=0;
 291                                                                         nz--;
 292                                                                 }
 293
 294                                                         }
 295                                                 }
 296
 297                                                 if (size>maxsize)
 298                                                 {
 299                                                         maxsize=size;
 300                                                         line_base=i;
 301                                                         line_local=k;
 302                                                 }
 303                                         }
 304                                 }
 305
 306                                 if (maxsize>min_bloque)
 307                                 {
 308                                         if (nbloques<blocksize)
 309                                         {
 310                                         bloque[nbloques].line1=line_base;
 311                                         bloque[nbloques].line2=line_local;
 312                                         bloque[nbloques].size=maxsize;
 313                                         nbloques++;
 314                                         } else printf( "Error: OUT OF BLOCKS. \n");
 315
 316
 317                                         lbb=line_base+maxsize;
 318                                         total+=maxsize;
 319                                 }
 320
 321                         }
 322
 323                 }
 324
 325                 int j;
 326                 {
 327                         hashblock auxbloque[blocksize];
 328                         int minline=0, min_j=0;
 329                         for (p=0;p<nbloques;p++)
 330                         {
 331                                 minline=Bsize;
 332                                 for (j=0;j<64 && j<nbloques;j++)
 333                                 {
 334                                         if (bloque[j].line1<minline)
 335                                         {
 336                                                 minline=bloque[j].line1;
 337                                                 min_j=j;
 338                                         }
 339                                 }
 340                                 auxbloque[p]=bloque[min_j];
 341                                 bloque[min_j].line1=Bsize;
 342                         }
 343                         memcpy(bloque,auxbloque,blocksize*sizeof(hashblock));
 344                 }
 345                 return nbloques;
 346 }