#!/usr/bin/env ruby

#######################################################################
# rss_grab.rb - get contents of RSS feed                              #
# by Paul Duncan <pabs@pablotron.org>                                 #
#                                                                     #
#                                                                     #
# This file is distributed with Raggle, please see the Raggle page at #
# http://www.pablotron.org/software/raggle/ for the latest version of #
# this software.                                                      #
#                                                                     #
#                                                                     #
# Copyright (C) 2003 Paul Duncan, and various contributors.           #
#                                                                     #
# Permission is hereby granted, free of charge, to any person         #
# obtaining a copy of this software and associated documentation      #
# files (the "Software"), to deal in the Software without             #
# restriction, including without limitation the rights to use, copy,  #
# modify, merge, publish, distribute, sublicense, and/or sell copies  #
# of the Software, and to permit persons to whom the Software is      #
# furnished to do so, subject to the following conditions:            #
#                                                                     #
# The above copyright notice and this permission notice shall be      #
# included in all copies of the Software, its documentation and       #
# marketing & publicity materials, and acknowledgment shall be given  #
# in the documentation, materials and software packages that this     #
# Software was used.                                                  #
#                                                                     #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,     #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF  #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND               #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY    #
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF          #
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION  #
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.     #
#######################################################################

# load necessary modules
require 'rexml/document'
require 'net/http'
# HTTPS support is optional: net/https is only usable when Ruby was built
# with OpenSSL.  A failed require raises LoadError, which descends from
# ScriptError rather than StandardError, so a bare `rescue` would not catch
# it -- the exception class has to be named explicitly.
begin
  require 'net/https'
  $HAVE_SSL = true
rescue LoadError
  $HAVE_SSL = false
end


# a feed URL is mandatory: without any argument, print the usage line to
# stderr and stop with a failure status
if ARGV.empty?
  $stderr.puts "Usage: #$0 [feed_url]"
  exit(-1)
end

module Feed
  # Feed::Item struct definition: one entry of a feed.  Any member is nil
  # when the corresponding element is missing from (or empty in) the feed.
  Item = Struct.new :title, :link, :desc

  #
  # Return the contents of a URL as a String.
  #
  # url            - address to fetch.  "https://" URLs use SSL (default
  #                  port 443); any other scheme, or no scheme at all, is
  #                  fetched over plain HTTP (default port 80).  The string
  #                  passed in is never modified.
  # redirect_limit - how many HTTP redirects may still be followed.
  #
  # Raises RuntimeError when the URL cannot be parsed, when HTTPS is
  # requested but $HAVE_SSL is not set, when the redirect limit is exceeded,
  # or when the server answers with a non-success status.  Connection errors
  # raised by Net::HTTP are passed through unchanged.
  #
  def self.get_url(url, redirect_limit = 10)
    raise 'HTTP Error: too many redirects' if redirect_limit < 0

    uri = parse_url(url)
    use_ssl = uri.scheme.to_s.downcase == 'https'
    raise 'HTTPS support requires OpenSSL-Ruby' if use_ssl && !$HAVE_SSL

    # URI only knows default ports for schemes it recognises, so fall back
    # to the HTTP/HTTPS defaults ourselves
    port = uri.port || (use_ssl ? 443 : 80)

    # request path, including the query string if there is one
    path = uri.path.to_s
    path = '/' if path.empty?
    path += "?#{uri.query}" if uri.query

    # SSL has to be switched on before the session is started; the block
    # form of #start closes the connection even if the request raises
    http = Net::HTTP.new(uri.hostname, port)
    http.use_ssl = true if use_ssl
    resp = http.start { |conn| conn.get(path) }

    case resp
    when Net::HTTPSuccess
      resp.body
    when Net::HTTPRedirection
      location = resp['Location']
      raise "HTTP Error: #{resp.code} #{resp.message}" unless location

      # Location may be relative, so resolve it against the current URL
      begin
        target = URI.join(uri.to_s, location).to_s
      rescue URI::Error
        raise "Couldn't parse URL: \"#{location}\""
      end
      get_url(target, redirect_limit - 1)
    else
      raise "HTTP Error: #{resp.code} #{resp.message}"
    end
  end

  #
  # Turn a URL string into a URI object without touching the original
  # string.  A URL without a "scheme://" prefix is treated as HTTP.
  #
  # Raises RuntimeError if the URL is malformed or names no host.
  #
  def self.parse_url(url)
    text = url.to_s.strip
    text = "http://#{text}" unless text =~ %r{\A\w+://}

    uri = URI.parse(text)
    raise URI::InvalidURIError, 'missing host' if uri.host.nil? || uri.host.empty?
    uri
  rescue URI::Error
    raise "Couldn't parse URL: \"#{url}\""
  end
  private_class_method :parse_url

  #
  # An RSS channel: its descriptive fields plus the list of its items.
  #
  class Channel
    attr_accessor :title, :link, :desc, :lang, :items

    # Fetch and parse the feed at +url+ (see parse_rss_url).
    def initialize(url)
      parse_rss_url url
    end

    #
    # Fetch an RSS URL and fill in this channel's title, link, description
    # and language, plus an array of Feed::Item structs in #items.
    # Channel fields that are absent from the feed are left untouched.
    #
    # Returns self.  Raises RuntimeError if the URL cannot be fetched or its
    # content has no XML root element.
    #
    def parse_rss_url(url)
      begin
        content = Feed::get_url url
      rescue
        raise "Couldn't get URL \"#{url}\": #$!."
      end

      # parse URL content; an empty or non-XML body yields no root element
      root = REXML::Document.new(content).root
      raise "Couldn't parse content of URL \"#{url}\" as XML." unless root

      # get channel info
      e = nil
      @title = e.text if e = root.elements['//channel/title']
      @link = e.text if e = root.elements['//channel/link']
      @desc = e.text if e = root.elements['//channel/description']
      @lang = e.text if e = root.elements['//channel/language']

      # build list of feed items; every child element of an item is optional
      @items = root.elements.to_a('//item').map do |item|
        Feed::Item.new(child_text(item, 'title'),
                       child_text(item, 'link'),
                       child_text(item, 'description'))
      end

      self
    end

    private

    # Text of the child element +name+ of +parent+, or nil if it is absent.
    def child_text(parent, name)
      child = parent.elements[name]
      child && child.text
    end
  end
end

# fetch and parse the feed named by the first command-line argument
chan = Feed::Channel.new ARGV.shift

# print the channel header, followed by a blank separator line
puts 'Channel Information:',
     "  Title: #{chan.title}",
     "  Link:  #{chan.link}",
     "  Desc:  #{chan.desc}",
     "  Lang:  #{chan.lang}",
     ''

# iterate through and print each feed item as "title: link".  Interpolation
# is used instead of String#<< so the item's title is not modified in place
# and a nil title or link prints as an empty string rather than raising.
chan.items.each { |item| puts "#{item.title}: #{item.link}" }


# syntax highlighted by Code2HTML, v. 0.9.1