#!/usr/bin/env ruby

require 'rubygems'
require 'open-uri'
require 'hpricot'
require 'icalendar'

# Downloads (or loads from a local cache file) an HTML schedule page, walks
# the table rows found at SCHEDULE_ROOT and records each row as an event on an
# Icalendar::Calendar, which can then be serialized with #to_ical / #write.
class Page
  # Hpricot search path of the table rows that make up the schedule
  SCHEDULE_ROOT="html/body/div[2]/table/tr"

  # address of the schedule page; also the base for resolving event links
  attr_accessor :url
  # the Icalendar::Calendar that #parse adds events to
  attr_accessor :cal

  # url - address of the schedule page (may be set later through #url=)
  def initialize(url=nil)
    @url=url
    @cal = Icalendar::Calendar.new
    @response=nil
  end

  # Forwards to the calendar's own #clear.
  def clear
    @cal.clear
  end

  # Turns a link found on the page into an absolute URL.
  #
  # base_url  - URL of the page the link was found on
  # event_url - the link's href; absolute, root-relative ("/x") or
  #             relative to the page ("x.html")
  #
  # Returns the absolute URL as a String.
  def absolute_url(base_url,event_url)
    if event_url.start_with? "http"
      #absolute - leave it as is
      event_url
    elsif event_url.start_with? "/"
      #tack on the site: keep scheme and host only (no trailing slash, the
      #href already brings its own leading one)
      base_url.sub(%r{\A(https?://[^/]+).*}m,'\1') + event_url
    elsif base_url =~ %r{\Ahttps?://[^/]+\z}
      #base is a bare site with no path at all - the page lives at its root
      "#{base_url}/#{event_url}"
    else
      #tack on the site up to the current page name
      base_url.sub(%r{(.*/).*},'\1') + event_url
    end
  end

  # Loads the page body into @response.
  #
  # file_name - optional cache file. Use a file name if the url is already
  #             downloaded (good for testing): when the file exists it is read
  #             instead of hitting the network; when it does not exist the
  #             page is fetched from #url and then saved under that name.
  #
  # Returns the page body.
  def read(file_name=nil)
    if file_name && File.file?(file_name)
      @response = File.read(file_name)
    else
      @response = fetch
      #cache it for future use (block form closes and flushes the file)
      File.open(file_name,'w') { |f| f.write @response } if file_name
    end
    @response
  end

  # Writes the calendar in iCalendar format to file_name.
  def write(file_name)
    File.open(file_name, 'w') do |f|
      f.puts to_ical
    end
  end

  # Walks every schedule row of the loaded page. A row whose only cell holds
  # a <strong> element sets the current date (text before the first ':');
  # other rows containing a <strong> are skipped; remaining rows become
  # events once a date is known. Call #read first.
  def parse
    doc = Hpricot(@response)

    #hold the date header
    cur_date = ''
    (doc/SCHEDULE_ROOT).each do |row|
      cols = row/"td"
      header = (row/"td/strong").first

      if header
        #dates are single elements in headers; to_s guards an empty header
        cur_date = header.inner_html.split(':').first.to_s if cols.length == 1
      elsif !cur_date.empty?
        #output events (only if we know a date for the event)
        add_event(cols, cur_date)
      end
    end
  end

  # Returns the calendar serialized as an iCalendar string.
  def to_ical
    @cal.to_ical
  end

  private

  # Fetches @url over the network and returns the response body.
  def fetch
    headers = {
      "User-Agent" => "Ruby/#{RUBY_VERSION}",
      "From" => "email@addr.com",
      "Referer" => "http://reflectivepixel.com/"
    }
    if URI.respond_to?(:open)
      #Kernel#open stopped handling URLs in Ruby 3.0; URI.open is the
      #open-uri entry point on current Rubies
      URI.open(@url, headers) { |f| f.read }
    else
      #older open-uri only patches Kernel#open
      open(@url, headers) { |f| f.read }
    end
  end

  # Adds one event to the calendar from a row's cells.
  #
  # cols     - the row's <td> elements: 0 has the "start-end" time range,
  #            1 has the description (optionally linked), 2 has the author
  #            (optional)
  # cur_date - date text taken from the most recent date header
  def add_event(cols, cur_date)
    return if cols.length < 2 #must at least have a description

    day_range = cols[0].inner_html.split("-")
    summary = cols[1].inner_text
    return if summary =~ /^break\s/i

    e = @cal.event
    #e.timestamp = DateTime.now
    e.dtstart = DateTime.parse("#{cur_date} #{day_range[0]}")
    e.dtend = DateTime.parse("#{cur_date} #{day_range[1]}")
    e.summary = summary

    url_node = (cols[1]/"a").first
    href = url_node.attributes['href'] unless url_node.nil? || url_node.empty?
    #an anchor without an href (e.g. a named anchor) has nothing to link to
    unless href.to_s.empty?
      e.url absolute_url(@url, href), {"VALUE"=>"URI"}
    end

    if cols[2] && cols[2].inner_html != '&nbsp;'
      e.description = "#{summary} by #{cols[2].inner_text}"
    else
      e.description = summary
    end
  end
end

# Script entry point: load the schedule (from the 'schedule.html' cache when
# it exists), convert it and write the resulting calendar file.
if __FILE__ == $0
  begin
    page = Page.new("http://www.voicesthatmatter.com/ruby2008/schedule.aspx")
    page.read 'schedule.html'
    page.parse
    page.write 'vtm_ruby2008.ics'

  rescue StandardError => e
    # StandardError rather than Exception so Ctrl-C and exit still work;
    # report on stderr and signal failure through the exit status
    $stderr.puts e
    $stderr.puts e.backtrace
    exit 1
  end
end
