add LM 5 hw, caryatid
[light-and-matter.git] / scripts / split_book.pl
blob3df3eb60e084bc9fdfba6b65fa8955bf715ea9dd
1 #!/usr/bin/perl
3 # This script has two algorithms in it, an n^2 and an n log n. I haven't
4 # benchmarked it, but I think they should perform the same on n=2,
5 # while the n log n one should do worse on n=4 by a factor of 12/10,
6 # but better on n=15 (Simple Nature's current size) by
7 # about a factor of 2. I should be able to improve
8 # the performance of the n log n algorithm by making it into a base-4
9 # algorithm instead of base-2, but the improvement would only be 20%.
11 use strict;
12 use POSIX;
13 use FindBin qw($Bin);
# Package globals assigned by the splitting code below; $kmax is also read
# by get_chunk() and first_page().  $input_pdf is deliberately NOT listed
# here: it is a file-scoped lexical, and declaring it with both "our" and
# "my" only shadowed the package variable.
our ($nchunks, $kmax, $label);

# First command-line argument: the PDF file to split.
my $input_pdf = shift @ARGV;
die "usage: split_book.pl input.pdf [pages_per_chunk]\n" unless defined $input_pdf;
die "input file $input_pdf not found" unless -e $input_pdf;

# Optional second argument: pages per chunk (default 50).
my $chunk_size = 50;
if (@ARGV) {
    $chunk_size = shift @ARGV;
}
# Require a positive whole number; a fractional size would produce
# fractional page ranges further down.
die "illegal chunk size=$chunk_size" unless $chunk_size =~ /\A\d+\z/ && $chunk_size>=1;

# Ask the helper script for the page count.  The list form of a piped open
# bypasses the shell, so a file name containing spaces or shell
# metacharacters is passed through intact.
my $page_counter = "$Bin/pdf_page_count.rb";
open(my $count_fh, '-|', $page_counter, $input_pdf)
    or die "split_book.pl: cannot run $page_counter: $!";
my $npages = do { local $/; <$count_fh> };   # slurp all of the helper's output
close $count_fh;
$npages = '' unless defined $npages;
$npages =~ s/\A\s+|\s+\z//g;                 # drop the trailing newline etc.
unless ($npages>=1 && $npages<=10000) {die "split_book.pl: npages=$npages fails sanity check for input file $input_pdf"}
# The following works, but takes n^2 time, where n is the number of pages.
# It is disabled (if (0)); the n log n version is the one that runs.
# Each chunk of $chunk_size pages is extracted directly from the full
# input file and written to <input>a.pdf, <input>b.pdf, ...
if (0) {
    $label = 'a';
    for (my $p=1; $p<=$npages; $p+=$chunk_size) {
        # Anchor the substitution at the end of the name so that only the
        # extension is touched, and refuse names without it: otherwise the
        # output name would equal the input name.
        (my $output_pdf = $input_pdf) =~ s/\.pdf\z/${label}.pdf/
            or die "input file name $input_pdf does not end in .pdf";
        my $q = $p+$chunk_size-1;
        if ($q>$npages) {$q=$npages}     # last chunk may be short
        # List form of system() bypasses the shell, so odd characters in
        # the file names are safe.
        my @cmd = ("$Bin/pdf_extract_pages.rb", $input_pdf, "$p-$q", $output_pdf);
        #print "cmd=@cmd\n";
        system(@cmd)==0 or die "error executing command @cmd, wait status $?, $!";
        # Magic string increment: a..z, then aa, ab, ... (chr(ord()+1)
        # ran into punctuation after 'z').
        $label++;
    }
}
# This version takes n log n time.
# The chunks are the leaves of a binary tree of temporary PDFs built by
# get_chunk(); each finished leaf is renamed to <input>a.pdf, <input>b.pdf, ...
if (1) {
    # Without a .pdf suffix the output name would equal the input name and
    # the rename below would overwrite the input file.
    die "input file name $input_pdf does not end in .pdf" unless $input_pdf =~ /\.pdf\z/;

    # get_chunk() reuses any split_temp_*.pdf that already exists, so
    # leftovers from an interrupted earlier run must go before we start.
    unlink glob('split_temp_*.pdf');

    # Number of chunks, rounding up for a short final chunk.
    $nchunks = int($npages/$chunk_size);
    ++$nchunks if ($nchunks*$chunk_size<$npages);
    $kmax = log2ceiling($nchunks); # number of binary digits needed to represent nchunks
    $label = 'a';
    for (my $n=0; $n<$nchunks; $n++) {
        my $t = get_chunk($n,$kmax);
        # Anchored so that only the trailing extension is rewritten.
        (my $output_pdf = $input_pdf) =~ s/\.pdf\z/${label}.pdf/;
        # List form of system() bypasses the shell, so odd characters in
        # the file names are safe.
        my @cmd = ('mv', $t, $output_pdf);
        print "@cmd\n";
        system(@cmd)==0 or die "error executing command @cmd, wait status $?, $!";
        # Magic string increment: a..z, then aa, ab, ... (chr(ord()+1)
        # ran into punctuation after 'z').
        $label++;
    }
    # Remove the intermediate tree nodes; silent if there are none.
    unlink glob('split_temp_*.pdf');
}
# Return the name of a PDF file holding node $n at level $k of the binary
# splitting tree, creating the file first if it does not exist yet.
#
# Level 0 is the input file itself.  A node at level $k covers
# $chunk_size*2**($kmax-$k) pages, so the nodes at level $kmax are the
# final chunks.  A node is extracted from its parent (node int($n/2) at
# level $k-1, covering twice as many pages), which is obtained recursively.
#
#   $n - zero-based index of the node within its level
#   $k - level in the tree; 0 means the whole input file
#
# Reads the file-level $input_pdf, $chunk_size, $npages and $kmax.
# Dies if the page range is empty or the extraction command fails.
sub get_chunk {
    my ($n, $k) = @_;
    return $input_pdf if $k == 0;

    my $t = temp_file_name($n, $k);
    # A temp file that already exists is taken to be this node, made by an
    # earlier call; that reuse is what lets siblings share their parent.
    return $t if -e $t;

    my $mommy_n = int($n / 2);
    my $mommy   = get_chunk($mommy_n, $k - 1);
    my $my_chunk_size = $chunk_size * pow2($kmax - $k);

    # Page range of the parent in the numbering of the original document;
    # the last parent on a level may be cut short by the end of the book.
    my $mommy_first_page = first_page($mommy_n, $k - 1);
    my $mommy_last_page  = $mommy_first_page + 2 * $my_chunk_size - 1;
    $mommy_last_page = $npages if $mommy_last_page > $npages;
    my $mommy_n_pages = $mommy_last_page - $mommy_first_page + 1;

    # Page range of this node, renumbered relative to the parent file.
    my $p = first_page($n, $k) - $mommy_first_page + 1;
    my $q = $p + $my_chunk_size - 1;
    $q = $mommy_n_pages if $q > $mommy_n_pages;
    #print "mommy_first=$mommy_first_page mommy_last=$mommy_last_page q=$q\n";
    die "get_chunk: node $n at level $k starts beyond page $npages" if $p > $q;

    # List form of system() bypasses the shell, so an input file name with
    # spaces or metacharacters is passed through intact.
    my @cmd = ("$Bin/pdf_extract_pages.rb", $mommy, "$p-$q", $t);
    print "@cmd\n";
    system(@cmd) == 0 or die "error executing command @cmd, wait status $?, $!";
    #system("echo \"\" >$t");

    return $t;
}
# Return the 1-based page number, in the original document, of the first
# page of node $n at level $k.  A node at level $k spans
# $chunk_size*2**($kmax-$k) pages (file-level $chunk_size and $kmax).
sub first_page {
    my ($n, $k) = @_;
    my $pages_per_node = $chunk_size * pow2($kmax - $k);
    return 1 + $n * $pages_per_node;
}
# Return 2 raised to the power $x by repeated doubling.
# Dies with "negative power in pow2" if the exponent is, or becomes while
# counting down, negative.
sub pow2 {
    my ($exponent) = @_;
    my $result = 1;
    until ($exponent == 0) {
        die "negative power in pow2" if $exponent < 0;
        $result *= 2;
        $exponent -= 1;
    }
    return $result;
}
# Build the temporary file name for node $n at level $k: the binary
# representation of $n, left-padded with zeros to at least $k digits,
# wrapped as split_temp_<digits>.pdf.
sub temp_file_name {
    my ($n, $k) = @_;
    my $digits = to_binary_string($n);
    $digits = '0' . $digits until length($digits) >= $k;
    return sprintf('split_temp_%s.pdf', $digits);
}
# Return the binary representation of $x as a string of '0'/'1'
# characters, most significant digit first ('0' for zero).
sub to_binary_string {
    my ($value) = @_;
    my $low_bits = '';
    # Peel off the least significant digit until only the leading one
    # (or a bare zero) is left.
    until ($value == 0 || $value == 1) {
        $low_bits = ($value % 2) . $low_bits;
        $value    = int($value / 2);
    }
    return ($value == 0 ? '0' : '1') . $low_bits;
}
# Return 1 plus the number of times $x can be halved before it drops
# below 2; for x < 2 the result is 1.  For a positive integer this is
# the number of binary digits in $x, which for exact powers of two is
# one more than ceil(log2($x)) (e.g. 2 -> 2, 4 -> 3).
sub log2ceiling {
    my ($remaining) = @_;
    my $digits = 1;
    until ($remaining < 2) {
        $remaining /= 2;
        $digits++;
    }
    return $digits;
}