4 ###APPDATE: 2008-04-27 07:37:08
6 ###APPDESC: Convert HTML novel from wenku.com to plain text
7 ###APPUSAGE: [option] directory
8 ###APPEXAMPLE: wenku2txt
9 ###APPOPTION: --multi:Convert to multiple files|--one:Convert to one file(default)
11 # The XR_PERL_MODULE_DIR environment variable MUST be set:
12 # its value is added to Perl's module search path below.
14 use lib
$ENV{XR_PERL_MODULE_DIR
};
16 use MyPlace
::Script
::Usage qw
/help_required help_even_empty/;
17 exit 0 if(help_even_empty
($0,@ARGV));
21 if($multi eq "--multi") {
25 elsif($multi eq "--one") {
34 die("Invalid directory:$srcd\n") unless(-d
$srcd and -r
"$srcd/index.htm");
36 require HTML
::TreeBuilder
;
41 my $tree = HTML
::TreeBuilder
->new();
42 $tree->parse_file("$srcd/index.htm");
44 my @tags = $tree->find("title");
45 $title=$tags[0]->as_text() if(@tags);
47 require MyPlace
::Filename
;
48 $title=MyPlace
::Filename
::get_basename
(MyPlace
::Filename
::get_fullname
($srcd));
51 foreach my $node ($tree->find("a")) {
52 my $href = $node->attr("href");
53 my $text = $node->as_text();
54 if($href =~ /\d+\.htm$/) {
55 push @pages,[$href,$text];
59 use MyPlace
::HTML
::Convertor
;
63 my $result = text_from_file
( $file,["mText"],[] );
68 print STDERR
"Error when converting $file\n";
# Announce the output file "$title.txt", open it for writing, and write the
# title as its header.
print STDERR "Generating $title.txt ...\n";
# Abort with the OS error rather than silently printing to an unopened handle.
open(FO, ">", "$title.txt")
    or die("Cannot open $title.txt for writing: $!\n");
print FO "\n$title\n\n\n";
77 foreach my $idx(0 .. $#pages) {
78 print FO
$idx + 1,".",$pages[$idx]->[1],"\n";
# Number of digits in the page count; used below as the zero-padded width of
# each page index. scalar() makes the intended context explicit: a bare
# length(@pages) relies on implicit scalar context and raises the
# "length() used on @pages" warning when warnings are enabled.
my $numlen = length(scalar(@pages));
84 foreach my $idx(0 .. $#pages) {
85 my $file = $pages[$idx]->[0];
86 my $name = $pages[$idx]->[1];
# Left-pad the page index with "0" characters up to $numlen characters.
# A zero or negative repeat count yields an empty prefix, so an index that
# is already wide enough is left unchanged.
my $pad_count = $numlen - length($idx);
$idx = ("0" x $pad_count) . $idx;
# Announce the per-page output file, open it for writing, and write the
# title, the page name, and the list that htm2txt returns by reference for
# the page's source file.
print STDERR "Generating \"$title - $idx - $name.txt\"\n";
# Abort with the OS error rather than silently printing to an unopened handle.
open(FO, ">", "$title - $idx - $name.txt")
    or die("Cannot open \"$title - $idx - $name.txt\" for writing: $!\n");
print FO "\n$title\n\n\n$name\n\n", @{htm2txt("$srcd/$file")};
96 print FO
"\n\n$name\n\n",@
{htm2txt
("$srcd/$file")};
# Close the output handle when $multi is false. Buffered write errors only
# surface at close time, so report a failed close instead of discarding it.
close(FO) or die("Error closing output file: $!\n") unless($multi);