scripts/split_book.pl

   1 #!/usr/bin/perl
   2
# This script has two algorithms in it, an n^2 one and an n log n one. I haven't
# benchmarked them, but I think they should perform the same on n=2,
# while the n log n one should do worse on n=4 by a factor of 12/10,
# but better on n=15 (Simple Nature's current size) by
# about a factor of 2. I should be able to improve
# the performance of the n log n algorithm by making it into a base-4
# algorithm instead of base-2, but the improvement would only be 20%.
  10
  11 use strict;
  12 use POSIX;
  13 use FindBin qw($Bin);
  14
  15 our ($input_pdf,$nchunks,$kmax,$label);
  16
  17 my $input_pdf = shift @ARGV; # first command-line argument
  18 die "input file $input_pdf not found" unless  -e $input_pdf;
  19 my $chunk_size = 50;
  20 if (@ARGV) {
  21  $chunk_size = shift @ARGV; # optional second arg, pages per chunk
  22 }
  23 die "illegal chunk size=$chunk_size" unless $chunk_size>=1;
  24
  25 my $npages = `$Bin/pdf_page_count.rb $input_pdf`;
  26 unless ($npages>=1 && $npages<=10000) {die "split_book.pl: npages=$npages fails sanity check for input file $input_pdf"}
  27
  28 # The following works, but takes n^2 time, where n is the number of pages.
  29 if (0) {
  30
  31 $label = 'a';
  32
  33 for (my $p=1; $p<=$npages; $p+=$chunk_size) {
  34   my $output_pdf = $input_pdf;
  35   $output_pdf =~ s/\.pdf/${label}.pdf/;
  36   my $q = $p+$chunk_size-1;
  37   if ($q>$npages) {$q=$npages}
  38   my $cmd = "$Bin/pdf_extract_pages.rb $input_pdf $p-$q $output_pdf";
  39   #print "cmd=$cmd\n";
  40   system($cmd)==0 or die "error executing command $cmd, $!";
  41   $label = chr(ord($label)+1);
  42 }
  43
  44 }
  45
  46 # This version takes n log n time.
  47 if (1) {
  48   $nchunks = int($npages/$chunk_size);
  49   ++$nchunks if ($nchunks*$chunk_size<$npages);
  50   $kmax = log2ceiling($nchunks); # number of binary digits needed to represent nchunks
  51   $label = 'a';
  52   for (my $n=0; $n<$nchunks; $n++) {
  53     my $t = get_chunk($n,$kmax);
  54     my $output_pdf = $input_pdf;
  55     $output_pdf =~ s/\.pdf/${label}.pdf/;
  56     my $cmd = "mv $t $output_pdf";
  57     print "$cmd\n";
  58     system($cmd)==0 or die "error executing command $cmd, $!";
  59     $label = chr(ord($label)+1);
  60   }
  61   system("rm split_temp_*.pdf");
  62 }
  63
  64 sub get_chunk {
  65     my $n = shift;
  66     my $k = shift;
  67     return $input_pdf if $k==0;
  68     my $t = temp_file_name($n,$k);
  69     if (! -e $t) {
  70       my $mommy_n = int($n/2);
  71       my $mommy = get_chunk($mommy_n,$k-1);
  72       my $my_chunk_size = $chunk_size*pow2($kmax-$k);
  73       my $mommy_first_page = first_page($mommy_n,$k-1);
  74       my $mommy_last_page = $mommy_first_page+2*$my_chunk_size-1;
  75       $mommy_last_page = $npages if $mommy_last_page>$npages;
  76       my $mommy_n_pages = $mommy_last_page-$mommy_first_page+1;
  77       my $p = first_page($n,$k)-$mommy_first_page+1;
  78       my $q = $p+$my_chunk_size-1;
  79       $q = $mommy_last_page-$mommy_first_page+1 if $q>$mommy_last_page-$mommy_first_page+1;
  80       #print "mommy_first=$mommy_first_page mommy_last=$mommy_last_page q=$q\n";
  81       my $cmd = "$Bin/pdf_extract_pages.rb $mommy $p-$q $t";
  82       print "$cmd\n";
  83       system($cmd)==0 or die "error executing command $cmd, $!";
  84       #system("echo \"\" >$t");
  85     }
  86     return $t;
  87 }
  88
  89 sub first_page {
  90   my $n = shift;
  91   my $k = shift;
  92   my $my_chunk_size = $chunk_size*pow2($kmax-$k);
  93   return $n*$my_chunk_size+1;
  94 }
  95
  96 sub pow2 {
  97   my $x = shift;
  98   die "negative power in pow2" if $x<0;
  99   return 1 if $x==0;
 100   return 2*pow2($x-1);
 101 }
 102
 103 sub temp_file_name {
 104   my $n = shift;
 105   my $k = shift;
 106   my $e = to_binary_string($n);
 107   while (length($e)<$k) {$e='0'.$e}
 108   return "split_temp_$e.pdf";
 109 }
 110
 111 sub to_binary_string {
 112   my $x = shift;
 113   return '0' if $x==0;
 114   return '1' if $x==1;
 115   return to_binary_string(int($x/2)).to_binary_string($x%2);
 116 }
 117
 118 sub log2ceiling {
 119   my $x = shift;
 120   return 1 if $x<2;
 121   return 1+log2ceiling($x/2);
 122 }