# Program to replace commonly used HTML, extract the top and bottom parts
# of pages (which are roughly similar from page to page), and recompose the
# pages at the other end.
# By Stian Haklev (shaklev@gmail.com), 2007
# Released under the MIT and GPL licenses
7 require 'htmlshrinker-data'
# Reader/writer pairs for the two template halves that initialize stores in
# @before and @after.
10 attr_accessor :before, :after
# Builds the shrinker state from a stored template string.
#
# The template is two page fragments joined by the character 20.chr (decimal
# 20, i.e. "\x14"), which is the same separator extract_template emits. The
# first fragment is normalised into @before, the second into @after.
#
# template - String of the form "<before>" + 20.chr + "<after>".
# archive  - accepted for interface compatibility; not used by this method.
def initialize(template, archive)
  # Limit 2 keeps everything after the first separator in the tail instead of
  # silently dropping it. A missing or trailing separator yields nil for a
  # half, so fall back to an empty string rather than failing on nil below.
  head, tail = template.split(20.chr, 2)

  before = head || ''
  # Replace the page-specific <title> text with the TITLE placeholder.
  before = before.sub(/\<title>(.*?)\<\/title>/, '<title>TITLE</title>')
  # Rewrite every './' as '/'.
  before = before.gsub('./', '/')
  # Replace the page-specific first heading with the TITLE placeholder.
  before = before.sub(/\<h1 class\=\"firstHeading\">(.*?)\<\/h1>/, '<h1 class="firstHeading">TITLE</h1>')
  @before = before

  after = tail || ''
  # Prefix '/' to every href whose value does not already start with '/'.
  after = after.gsub(/href="([^\/])/, 'href="/\1')
  # Drop the first <li id="f-credits"> element.
  after = after.sub(/\<li id="f-credits">(.*?)\<\/li>/, '')
  @after = after
end
# NOTE(review): the `def` line that opens this method is not visible in this
# view; the statements below operate on a local named `text`.
# Split the input at its first two newlines into a title line, a languages
# line and the remaining body. `languages` is not used again in the visible
# statements.
22 title, languages, text = text.split("\n", 3)
23 # p languages.split(":")
# For every (x, y) pair in HTMLShrinker_data::Replacements, replace each
# occurrence of y in the body with x.
24 HTMLShrinker_data::Replacements.each {|x, y| text.gsub!(y, x)}
25 #gsub(/TITLE/, title).gsub("POINTER", @csstext + @jstext)
# Fill every 'TITLE' placeholder in @before with the title, then return
# @before + body + @after.
26 return @before.gsub('TITLE', title) + text + @after
# NOTE(review): the `def` line that opens this method is not visible in this
# view, and neither is the rest of the `if` branch below (including its `end`).
# Detect a page that contains a zero-delay meta Refresh redirect.
32 if text =~ /\<meta http-equiv=\"Refresh\" content=\"0\;url=(.*?)\" \/\>/
# Take the captured redirect target, remove every '../' from it and pass the
# result to url_unescape (defined outside this view).
33 url = url_unescape(Regexp::last_match[1].gsub('../', ''))
# Title is the text captured after "firstHeading"> up to the closing </h1>
# (dot matches newlines); "Unnamed" when the pattern does not match.
36 title = (text.match(/"firstHeading">(.*?)\<\/h1>/m) ? Regexp::last_match[1] : "Unnamed")
38 # if text.match(/<div id="p-lang" class="portlet">(.*?)\<\/div>/)
39 # languages = Regexp::last_match[1]
41 # languages.scan(/<a href="(.*?)">/) do |match|
42 # match = match[0].gsub("../", "")
43 # lang, url = match.split("/",2)
46 # languages = langs.to_a.join(":")
# Keep only the text between the start-content and end-content markers; when
# the pattern does not match, text is left unchanged.
49 text = Regexp::last_match[1] if text.match(/ start content -->(.*?)\<\!-- end content /m)
# For every (x, y) pair in HTMLShrinker_data::Replacements, replace each
# occurrence of x in the text with y.
50 HTMLShrinker_data::Replacements.each {|x, y| text.gsub!(x, y) }
# The return value is discarded; presumably ZUtil::strip_whitespace modifies
# text in place. TODO confirm against ZUtil.
51 ZUtil::strip_whitespace(text)
# Remove everything from each '<img src=' up to the next '>'.
52 text.gsub!(/<img src=(.*?)>/, "")
# NOTE(review): no visible line assigns `languages`; the only code that sets
# it is commented out above. Confirm it is defined before this line runs.
# Returns title, languages and text joined by newlines.
53 return [title, languages, text].join("\n")
# Takes an example HTML file and extracts its top and bottom parts.
# The result can later be stored and handed to HTMLShrinker at initialization,
# which is where the replacements on the two parts are applied.
# Cuts the page-independent head and tail out of a sample HTML page.
#
# text - String containing the HTML of one page.
#
# Returns the text before the first "<!-- start content -->" marker and the
# text after the first "<!-- end content -->" marker, joined by 20.chr
# (decimal 20, i.e. "\x14"). A half whose marker is absent is the empty string.
def extract_template(text)
  start_marker = text.match(/<\!-- start content -->/)
  end_marker = text.match(/<\!-- end content -->/)

  # Make the "marker not found" case explicit instead of relying on
  # Array#join turning nil into an empty string.
  before = start_marker ? start_marker.pre_match : ''
  after = end_marker ? end_marker.post_match : ''

  [before, after].join(20.chr)
end