updated git and svn scripts
[xrzperl.git] / wenku2txt
blob606ec922e808297e0b2092c02596313bcba8ef08
1 #!/usr/bin/perl
2 ###APPNAME: wenku2txt
3 ###APPAUTHOR: duel
4 ###APPDATE: 2008-04-27 07:37:08
5 ###APPVER: 0.1
6 ###APPDESC: Convert HTML novel from wenku.com to plain text
7 ###APPUSAGE: [option] directory
8 ###APPEXAMPLE: wenku2txt
9 ###APPOPTION: --multi:Convert to multiple files|--one:Convert to one file(default)
10 use strict;
11 #The XR_PERL_MODULE_DIR environment variable MUST be defined
12 #so that perl knows where to search for the MyPlace modules;
13 #otherwise nothing below will work.
14 use lib $ENV{XR_PERL_MODULE_DIR};
16 use MyPlace::Script::Usage qw/help_required help_even_empty/;
# Show the usage text and stop when help_even_empty() says so.
exit 0 if help_even_empty($0, @ARGV);

# Command line: [--multi|--one] directory
#   --multi  : one output file per chapter
#   --one    : a single combined output file (default)
# When the first argument is not one of the two switches it is taken to be
# the source directory itself.
my $first_arg = shift;
my $multi;    # 1 => multiple files, 0 => one file
my $srcd;     # directory holding index.htm and the chapter pages
if ($first_arg eq "--multi" or $first_arg eq "--one") {
    $multi = ($first_arg eq "--multi") ? 1 : 0;
    $srcd  = shift;
}
else {
    $multi = 0;
    $srcd  = $first_arg;
}

# The source must be a directory containing a readable index.htm.
if (!(-d $srcd) or !(-r "$srcd/index.htm")) {
    die("Invalid directory:$srcd\n");
}
# Parse index.htm to find the book title and the list of chapter pages.
require HTML::TreeBuilder;
no warnings;    # lexically scoped: stays in effect for the rest of the file

my $title;      # book title, used to build the output file name(s)
my @pages;      # one [href, link text] pair per chapter link
my $tree = HTML::TreeBuilder->new();
$tree->parse_file("$srcd/index.htm");

# Prefer the text of the first <title> element.
my ($title_node) = $tree->find("title");
$title = $title_node->as_text() if defined $title_node;

# No usable title in the page: fall back to the directory's base name.
if (!$title) {
    require MyPlace::Filename;
    $title = MyPlace::Filename::get_basename(MyPlace::Filename::get_fullname($srcd));
}

# Every link whose target ends in "<digits>.htm" is treated as a chapter.
for my $anchor ($tree->find("a")) {
    my $href = $anchor->attr("href");
    my $text = $anchor->as_text();
    next unless $href =~ /\d+\.htm$/;
    push @pages, [$href, $text];
}
59 use MyPlace::HTML::Convertor;
# htm2txt(FILE)
# Convert one HTML chapter file to text via text_from_file().
#
# Returns whatever text_from_file() produced when that value is true;
# callers dereference it as an array, so it is presumably an array
# reference of text chunks -- TODO confirm against MyPlace::HTML::Convertor.
# On failure an error is reported on STDERR and a reference to an empty
# array is returned, so callers can always dereference the result.
#
# NOTE(review): the ["mText"] and [] arguments are passed through as-is;
# their meaning is defined by text_from_file() and is not visible here.
sub htm2txt($) {
    my $path      = shift;
    my $converted = text_from_file($path, ["mText"], []);
    return $converted if $converted;

    print STDERR "Error when converting $path\n";
    return [];
}
# Write the converted text.
#
# Single-file mode (!$multi): "$title.txt" receives the title, a numbered
# table of contents and then every chapter in order.
# Multi-file mode ($multi): each chapter goes to its own file named
# "$title - NN - $name.txt", where NN is the chapter number zero-padded to
# the width of the total chapter count.
#
# Open/close failures are reported instead of being silently ignored
# (warnings are disabled in this file, so a failed open used to discard all
# output without any message).

my $one_fh;    # combined output handle; only opened in single-file mode
unless ($multi) {
    print STDERR "Generating $title.txt ...\n";
    # Nothing useful can be done without the single output file: abort.
    open $one_fh, ">", "$title.txt"
        or die("Cannot write \"$title.txt\": $!\n");
    print {$one_fh} "\n$title\n\n\n";
    # Table of contents: "1.Chapter name", one entry per line.
    foreach my $idx (0 .. $#pages) {
        print {$one_fh} $idx + 1, ".", $pages[$idx]->[1], "\n";
    }
    print {$one_fh} "\n\n";
}

# Number of digits needed to print the largest chapter number.
my $numlen = length(scalar @pages);
foreach my $idx (0 .. $#pages) {
    my ($file, $name) = @{ $pages[$idx] };
    # 1-based chapter number, zero-padded so file names sort correctly.
    my $num = sprintf("%0*d", $numlen, $idx + 1);
    if ($multi) {
        my $outname = "$title - $num - $name.txt";
        print STDERR "Generating \"$outname\"\n";
        my $fh;
        # One unwritable chapter should not stop the remaining ones.
        unless (open $fh, ">", $outname) {
            print STDERR "Cannot write \"$outname\": $!\n";
            next;
        }
        print {$fh} "\n$title\n\n\n$name\n\n", @{ htm2txt("$srcd/$file") };
        # Buffered write errors only surface at close time.
        close $fh
            or print STDERR "Error when closing \"$outname\": $!\n";
    }
    else {
        print {$one_fh} "\n\n$name\n\n", @{ htm2txt("$srcd/$file") };
    }
}

unless ($multi) {
    close $one_fh
        or die("Error when closing \"$title.txt\": $!\n");
}