#!/usr/bin/env ruby
#######################################################################
# rss_grab.rb - get contents of RSS feed #
# by Paul Duncan <pabs@pablotron.org> #
# #
# #
# This file is distributed with Raggle, please see the Raggle page at #
# http://www.pablotron.org/software/raggle/ for the latest version of #
# this software. #
# #
# #
# Copyright (C) 2003 Paul Duncan, and various contributors. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation #
# files (the "Software"), to deal in the Software without #
# restriction, including without limitation the rights to use, copy, #
# modify, merge, publish, distribute, sublicense, and/or sell copies #
# of the Software, and to permit persons to whom the Software is #
# furnished to do so, subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies of the Software, its documentation and #
# marketing & publicity materials, and acknowledgment shall be given #
# in the documentation, materials and software packages that this #
# Software was used. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY #
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF #
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION #
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #
#######################################################################
# load necessary modules
require 'rexml/document'
require 'net/http'
# HTTPS support is optional: record in $HAVE_SSL whether the SSL bindings
# could be loaded.  A failed require raises LoadError, which is a
# ScriptError rather than a StandardError, so it has to be rescued by name
# (a bare `rescue` would let it propagate and abort the script).
begin
  require 'net/https'
  $HAVE_SSL = true
rescue LoadError
  $HAVE_SSL = false
end
# a feed URL must be given on the command-line; otherwise print the usage
# line to stderr and stop with a failure status
if ARGV.empty?
  $stderr.puts("Usage: #{$0} [feed_url]")
  exit(-1)
end
#
# Minimal RSS fetching and parsing helpers.
#
module Feed
  # Feed::Item struct definition: one feed entry.  Any field is nil when
  # the feed omits (or leaves empty) the corresponding element.
  Item = Struct.new :title, :link, :desc

  # maximum number of HTTP redirects followed before giving up
  MAX_REDIRECTS = 10

  # matches a leading 'scheme://' prefix
  SCHEME_RE = %r{\A\w+://}

  #
  # Return url as a string with an explicit scheme, assuming 'http://'
  # when none was given.  The argument itself is never modified.
  #
  def Feed::absolute_url(url)
    text = url.to_s
    text =~ SCHEME_RE ? text : "http://#{text}"
  end

  #
  # Split a URL into [host, port, path, use_ssl].
  #
  # port is an Integer (443 by default for https, 80 otherwise), path
  # always starts with '/' and carries the query string, and use_ssl is
  # true only for the https scheme.  Raises RuntimeError if the URL cannot
  # be parsed or has no host.
  #
  def Feed::parse_url(url)
    begin
      uri = URI.parse(absolute_url(url))
    rescue URI::InvalidURIError
      raise "Couldn't parse URL: \"#{url}\""
    end

    host = uri.hostname
    raise "Couldn't parse URL: \"#{url}\"" if host.nil? || host.empty?

    use_ssl = uri.scheme.to_s.downcase == 'https'
    # schemes unknown to URI carry no default port
    port = uri.port || (use_ssl ? 443 : 80)

    path = uri.path.to_s
    path = '/' if path.empty?
    path = "#{path}?#{uri.query}" if uri.query

    [host, port, path, use_ssl]
  end

  #
  # Return the contents of a URL
  #
  # Follows up to redirect_limit HTTP redirects (relative Location headers
  # are resolved against the current URL).  Raises RuntimeError for
  # unparsable URLs, for https URLs when $HAVE_SSL is not set, for
  # non-success HTTP statuses and when the redirect limit is exhausted;
  # connection-level exceptions from Net::HTTP propagate unchanged.
  #
  def Feed::get_url(url, redirect_limit = MAX_REDIRECTS)
    host, port, path, use_ssl = parse_url(url)
    raise 'HTTPS support requires OpenSSL-Ruby' if use_ssl && !$HAVE_SSL

    # SSL has to be configured before the session is started
    http = Net::HTTP.new(host, port)
    http.use_ssl = true if use_ssl

    # the block form closes the connection even when the request raises
    resp = http.start { |conn| conn.get(path) }

    case resp
    when Net::HTTPSuccess
      resp.body.to_s
    when Net::HTTPRedirection
      location = resp['Location']
      raise "HTTP Error: #{resp.code} #{resp.message}" unless location
      raise 'HTTP Error: too many redirects' if redirect_limit <= 0
      get_url(URI.join(absolute_url(url), location).to_s, redirect_limit - 1)
    else
      raise "HTTP Error: #{resp.code} #{resp.message}"
    end
  end

  #
  # An RSS channel: its title, link, description, language and the list
  # of Feed::Item structs it contains.
  #
  class Channel
    attr_accessor :title, :link, :desc, :lang, :items

    #
    # Fetch and parse the feed at url (see parse_rss_url).
    #
    def initialize(url)
      parse_rss_url url
    end

    #
    # Fetch an RSS URL and fill in this channel's attributes (title, link,
    # desc, lang) and its items array of Feed::Item structs.  Elements
    # missing from the feed yield nil values.  Returns self.
    #
    # Raises RuntimeError if the URL cannot be fetched or if the content
    # has no XML root element.
    #
    def parse_rss_url(url)
      begin
        content = Feed::get_url url
      rescue StandardError => err
        raise "Couldn't get URL \"#{url}\": #{err}."
      end

      # parse URL content
      root = REXML::Document.new(content).root
      raise "Couldn't parse feed \"#{url}\": no XML root element." unless root

      # get channel info
      @title = text_at(root, '//channel/title')
      @link = text_at(root, '//channel/link')
      @desc = text_at(root, '//channel/description')
      @lang = text_at(root, '//channel/language')

      # build list of feed items
      @items = []
      root.elements.each('//item') do |node|
        @items << Feed::Item.new(text_at(node, 'title'),
                                 text_at(node, 'link'),
                                 text_at(node, 'description'))
      end

      self
    end

    private

    #
    # Text of the first element matching xpath below node, or nil when no
    # such element exists.
    #
    def text_at(node, xpath)
      found = node.elements[xpath]
      found ? found.text : nil
    end
  end
end
# fetch and parse the feed whose URL was given on the command-line
chan = Feed::Channel.new ARGV.shift
puts 'Channel Information:',
     " Title: #{chan.title}",
     " Link: #{chan.link}",
     " Desc: #{chan.desc}",
     " Lang: #{chan.lang}",
     ''
# iterate through and print each feed item; interpolation leaves the item
# strings untouched and tolerates items whose title or link is nil
chan.items.each { |item| puts "#{item.title}: #{item.link}" }
# syntax highlighted by Code2HTML, v. 0.9.1