2 ###APPNAME: thread_persist_content
4 ###APPDATE: 2009-01-29 02:50:08
6 ###APPDESC: thread_persist_content
7 ###APPUSAGE: [urls|files]
8 ###APPEXAMPLE: thread_persist_content
12 #The XR_PERL_MODULE_DIR environment variable MUST be defined somewhere,
13 #so that perl knows which directory to search for modules.
15 use lib
$ENV{XR_PERL_MODULE_DIR
};
17 use MyPlace
::Script
::Usage qw
/help_required help_even_empty/;
# Stop with exit status 0 when help_required($0,@ARGV) returns a true value.
# NOTE(review): help_required comes from MyPlace::Script::Usage; presumably it
# prints the usage text itself before returning true - confirm.
18 exit 0 if(help_required
($0,@ARGV));
# Disabled alternative that calls help_even_empty instead.
19 #exit 0 if(help_even_empty($0,@ARGV));
21 use HTML
::TreeBuilder
;
22 use MyPlace
::HTML
::Convertor
;
23 use Encode qw
/decode/;
24 binmode STDERR
,"utf8";
# @files is declared and (presumably) pre-filled in lines not visible here.
# When it is empty, fall back to reading entries through <>, one per line,
# with the trailing newline removed by chomp.
30 unless(@files) {while(<>){chomp;push @files,$_;}}
# Nothing to work on after the fallback: abort with a message.
32 die("No files specified\n") unless(@files);
# Fragment of a title-lookup routine; its `sub` line is not visible here
# (presumably get_title, which the main loop calls with $tree - TODO confirm).
# List assignment keeps only the first node returned by look_down for
# _tag "title".
36 my ($title) = $tree->look_down("_tag","title");
# Returns the text content of that node.
# NOTE(review): if no title node was found, $title is undef and this method
# call dies, unless a guard exists in a line not shown here.
38 return $title->as_text();
# uniq_filename: builds a path of the form "$base$inc$ext" that does not name
# an existing plain file. The ($$) prototype declares two scalar arguments.
# NOTE(review): $base, $inc and $ext are set up in lines not visible here -
# presumably derived from the two arguments; confirm.
43 sub uniq_filename
($$) {
# Loop for as long as the candidate path exists as a plain file.
47 while(-f
"$base$inc$ext") {
# A false (empty or 0) $inc becomes 1; any other value is incremented.
48 $inc = $inc ?
$inc + 1 : 1;
# First candidate that passed the -f test as "not an existing plain file".
50 return "$base$inc$ext";
# Fragment of the download routine; its `sub` line and the copy loop are not
# visible here (presumably download_url, called from the main loop - confirm).
# An already existing plain file counts as success; nothing is fetched again.
55 return 1 if(-f
$file);
# Open a read pipe from the external command `netcat_autologin`, passing
# $url, $user and $pass as separate arguments (list form of open).
# Returns undef when the pipe cannot be opened.
56 open FI
,"-|","netcat_autologin",$url,$user,$pass or return undef;
# Open the destination file for writing; returns undef on failure.
# NOTE(review): in the visible code FI is not closed on this early return.
57 open FO
,">",$file or return undef;
# Fragment of a directory helper; its `sub` line is not visible here
# (presumably mkdir_check, which the main loop calls - TODO confirm).
# %dir_check is consulted first: a true entry for $dir returns 1 immediately.
66 return 1 if($dir_check{$dir});
# Failure path: write the current OS error text ($!) to STDERR and return 0.
# NOTE(review): the operation that failed is in a line not shown here.
69 print STDERR
"$!\n";return 0;
# Main loop: handle every entry of @files in turn. Several lines of the loop
# body (branch headers, closing braces, variable setup) are not visible in
# this view, so the notes below describe only the statements that are shown.
81 foreach my $file (@files) {
# Progress line on STDERR; $idx and $count are maintained in lines not shown.
82 print STDERR
"[$idx/$count] Process $file ...\n";
# Entries starting with "http://" (case-insensitive) are treated as URLs.
# NOTE(review): "https://" does not match this pattern.
84 if($file =~ /^http:\/\
//i) {
# Strip everything up to and including the last "/" from $filename, and fall
# back to "index.html" when nothing (or a false value) is left.
# NOTE(review): $filename is presumably initialised from $file in a line not
# shown here - confirm.
86 $filename =~ s/^.*\///;
87 $filename ||= "index.html";
# mkdir_check("src") must succeed, otherwise the script dies; the download
# target is then placed under "src/".
88 mkdir_check
("src") or die();
89 $filename = "src/$filename";
93 print STDERR
"Downloading $file->$filename ...\n";
# A true return value from download_url is treated as success.
94 if(download_url
($file,$filename)) {
95 # print STDERR "[OK]\n";
# Remove $filename if it exists as a plain file.
# NOTE(review): presumably this sits in the failure branch of the download
# check; the `else` line is not visible here.
98 unlink $filename if(-f
$filename);
99 # print STDERR "[Failed]\n";
# Reported when the entry is skipped as a missing file; the test that leads
# here is in a line not shown.
105 print STDERR
"File not exists : $file\t[Skipped]\n";
# Decode the current line ($_) from GBK and collect it in @data.
# NOTE(review): assumes the input is GBK-encoded; the read loop feeding $_ is
# not visible here.
111 push @data,decode
("gbk",$_);
# Build an HTML tree from the decoded lines and look up the title via
# get_title.
114 my $tree=HTML
::TreeBuilder
->new_from_content(@data);
115 my $title = get_title
($tree);
# Title clean-up, applied in this order:
#   - cut the title at its first "." (the dot and everything after it go),
#   - cut the title at its first "[",
#   - delete every "/", "\", "!", "*" and "+" character.
# $title is later used as a directory name and a file name (see $imgd, $dst).
121 $title =~ s/\..*$//g;
123 $title =~ s/\[.*$//g;
125 $title =~ s/[\/\\\!\*\+]//g
;
# Locate the content node: first by id == $body_id, then by class == $body_id,
# and finally fall back to the whole tree. $body_id is defined outside this
# view.
128 ($body) = $tree->look_down("id",$body_id);
129 ($body) = $tree->look_down("class",$body_id) unless($body);
131 $body = $tree unless($body);
132 print STDERR
"Found Content Title: $title ";
# Collect every <img> under $body whose src attribute ends in ".jpg"
# (case-insensitive).
133 my @images = $body->look_down(_tag
=>"img",src
=>qr/\.jpg$/i);
# Image branch.
# NOTE(review): presumably taken when @images is non-empty; the condition is
# in a line not shown here.
135 print STDERR
"Persist As Images\n";
# mkdir_check("img") must succeed or the script dies; a failure for the
# per-title directory only skips to the next entry of @files.
136 mkdir_check
("img") or die();
137 my $imgd = "img/$title";
138 mkdir_check
($imgd) or next;
# Open a write pipe to the external command `batchget` and send it one image
# src per line.
# NOTE(review): how $imgd reaches batchget (e.g. a chdir) is not visible here.
140 open FO
,"|-","batchget" or die("$!\n");
141 print FO
$_->attr('src'),"\n" foreach(@images);
# Text branch: the output file is "txt/<title>.txt".
146 mkdir_check
("txt") or die();
147 my $dst = "txt/$title.txt";
148 print STDERR
"Persist As Text -> $dst ...\t";
# Printed when the destination is skipped because it already exists; the
# existence test itself is in a line not shown here.
150 print STDERR
"[Skipped(File exists)]\n";
# Write the destination through a :utf8 output layer.
153 open FO
,">:utf8",$dst or die("$!\n");
# text_from_node is not defined in this view (presumably provided by
# MyPlace::HTML::Convertor - confirm). Its result is dereferenced as an array
# reference below.
154 my $text = text_from_node
($body);
# File layout: the title, an empty line, then the elements of @$text followed
# by a final newline (the last part only when $text is true).
155 print FO
$title,"\n","\n";
156 print FO @
{$text},"\n" if($text);
158 print STDERR
"[OK]\n";