#!/usr/local/bin/ruby -w

# Fetches a fixed list of web pages one after another, prints the text of
# each page's <title> and first <h1>, and finally dumps the list of URLs
# that were handled without an error. A timestamp is printed before and
# after the run.

require 'rubygems'
require 'threadpool'
require 'pp'
require 'hpricot'
require 'open-uri'

# Interpreter debug mode: among other things, every raised exception
# (including the ones rescued below) is also reported on stderr.
$DEBUG = true

# Returns the cleaned-up text of the first element matching +selector+ in
# the parsed document +page+, or the literal "(none)" when nothing matches.
#
# Cleaning steps, in order: unwrap a <body>...</body> pair if one is
# present, replace every remaining tag with a single space, collapse two
# consecutive newline-plus-whitespace runs into one blank line, and strip
# surrounding whitespace.
def element_text(page, selector)
  element = page.at(selector)
  return "(none)" if element.nil?

  text = element.inner_html
  text = text.sub(%r{<body.*?>(.*?)</body>}mi, '\1')
  text = text.gsub(/<.*?>/m, ' ')
  text = text.gsub(%r{(\n\s*){2}}, "\n\n")
  text.strip
end

processed = []

pages = %w( http://www.ibm.com http://www.axisforex.com.au http://www.abc.net.au http://www.cnn.com http://www.jobx.com http://www.jobx.com.au)
print "#{Time.now}\n"

pages.each do |url|
  print "started::url:[#{url}]\n"
  begin
    # URI#open (provided by open-uri) instead of Kernel#open: the latter no
    # longer accepts URLs on Ruby 3.0+, and it treats strings starting with
    # "|" as shell commands.
    page = URI.parse(url).open { |f| Hpricot(f) }
    title = element_text(page, "title")
    h1 = element_text(page, "h1")
    print "job #{url} title:[#{title}]\th1:[#{h1}].\n"
    processed << url
  rescue => e
    # Best effort: a failing page must not stop the remaining ones, but say
    # why it failed instead of swallowing the cause.
    print "error #{url}: #{e.class}: #{e.message}\n"
  end
end
pp processed
print "#{Time.now}\n"