
#!/usr/local/bin/ruby -w

require 'rubygems'
require 'threadpool'
require 'pp'
require 'hpricot'
require 'open-uri'

# Turn on the interpreter's global debug flag for the whole run.
$DEBUG = true

# Reduces an Hpricot node to trimmed plain text.
# Returns the placeholder "(none)" when the node is nil (element not found).
text_of = lambda do |node|
	if node.nil?
		"(none)"
	else
		markup = node.inner_html
		# If the fragment contains a <body> element, keep only its contents.
		markup = markup.sub(%r{<body.*?>(.*?)</body>}mi, '\1')
		# Replace every remaining tag with a single space.
		markup = markup.gsub(/<.*?>/m, ' ')
		# Normalise two consecutive newline+whitespace runs into one blank line.
		markup.gsub(%r{(\n\s*){2}}, "\n\n").strip
	end
end

# URLs to fetch, in order.
pages = %w(
	http://www.ibm.com
	http://www.axisforex.com.au
	http://www.abc.net.au
	http://www.cnn.com
	http://www.jobx.com
	http://www.jobx.com.au
)

# URLs that were fetched and reported without raising.
processed = []

print "#{Time.now}\n"

pages.each do |url|
	print "started::url:[#{url}]\n"
	begin
		# Fetch the page and parse it with Hpricot.
		page = open(url) { |stream| Hpricot(stream) }
		title = text_of.call(page.at("title"))
		h1 = text_of.call(page.at("h1"))
		print "job #{url} title:[#{title}]\th1:[#{h1}].\n"
		processed.push(url)
	rescue
		# Best effort: any StandardError while fetching or parsing is reported
		# and the loop moves on to the next URL.
		print "error #{url}\n"
	end
end

pp processed
print "#{Time.now}\n"
