[JDEV] New Transport Architecture - Idea

Jared Rhine jared at wordzoo.com
Mon Feb 5 14:14:18 CST 2001


[Citation date: Mon, 5 Feb 2001 11:46:17 -0800]

>>>>> Mark == Mark Zamoyta <jdev at jabber.org>

    Mark> I was unaware of what RSS was, but I'll look into it!  The
    Mark> key thing is to leverage the content and services of
    Mark> existing websites, and make it instantly available to Jabber
    Mark> users.

I'm currently using the script below to act as a pseudo-user which
collects RSS information as posts it as a headline to my Jabber
server, which shows up on the headlines tab of my winjab client.  To
receive headlines, I subscribe to the 'headline' user's presence.
It's based on the framework by DJ Adams at:

  http://www.pipetree.com/jabber/

I too agree that this is a superior approach to new transports.  I'm
not familiar with the RSS transport, though.  I think I'll prefer to
implement a pseudo-user client-based approach like this so I can have
simple control over posting logic, such as filtering (and mapping from
moreover.com 'click here' URLs to the actual URL).

Note, although DJ Adams' original framework was presence-sensitive
(only sent headlines if you were available) this version isn't (I like
to wake up in the morning with a headlines tab full of new stories).
Don't let this thing run too unattended, as it will happily keep
posting headlines even if you're not logged in.

-- begin --

#!/usr/bin/perl

use Net::Jabber;
use LWP::Simple;
use LWP::UserAgent;
use XML::RSS;
use File::Path;
use URI::Escape;

use strict;

use constant CACHEDIR => '/tmp/jabber-headlines-cache';
use constant DEBUGFILE => '/tmp/jabber-headlines.log';

use constant SERVER   => 'xxxxx;  # Obscured
use constant PORT     => 5222;
use constant USER     => 'headlines';
use constant PASSWORD => 'xxxxx; # Obscured
use constant RESOURCE => 'bot';
use constant DELAY    => 600;
use constant VERBOSE  => 3;

my %present;
my %cache;
my %fullcache;

my %sources = (
               'http://www.jabber.org/rss/articles.xml' => 5,
               'http://slashdot.org/slashdot.rdf' => 2,
               'http://search.cpan.org/recent.rdf' => 4,
               'http://udell.roninhouse.com/udell.rdf' => 7,
               'http://freshmeat.net/backend/fm.rdf' => 9,
               'http://www.mozilla.org/news.rdf' => 9,
               'http://www.newsforge.com/newsforge.rdf' => 6,
               'http://www.wired.com/news_drop/netcenter/netcenter.rdf' => 3,
               'http://www.moreover.com/cgi-local/page?index_home+rss' => 2,
#              'http://www.moreover.com/cgi-local/page?index_computerservices+rss' => 3,
               'http://www.moreover.com/cgi-local/page?index_enterprisecomputing+rss' => 3,
#              'http://www.moreover.com/cgi-local/page?index_e-commerce+rss' => 3,
               'http://www.moreover.com/cgi-local/page?index_personaltechnology+rss' => 2,
               'http://www.moreover.com/cgi-local/page?index_wireless+rss' => 2,
               'http://www.xenite.org/channels/lordoftherings.rss' => 8,
               'http://xml.com/xml/scriptingnews.rdf' => 4,
# Betanews?
               'http://www.nwfusion.com:8080/rss/wireless/query.html?qt=%2Bwireless&nh=10&rf=1' => 5,
#              'http://www.xmltree.com/whatsnew/rss.cfm' => 17,
               'http://www.moreover.com/cgi-local/page?index_xml+rss' => 7,
               'http://ilrt.org/discovery/rdf/resources/rss.rdf' => 19,
               'http://www.xmlhack.com/rsscat.php' => 3,
               'http://rootprompt.org/rss/' => 4,
               'http://www.apacheweek.com/issues/apacheweek-headlines.xml' => 8,
               'http://www.oreillynet.com/pub/q/32' => 7,
               'http://www.soap-wrc.com/webservices/rss.asp' => 10,
               'http://www.kuro5hin.org/backend.rdf' => 9,
               'http://cnn.com/cnn.rss' => 5,
               'http://p.moreover.com/cgi-local/page?index_java+rss' => 8,
               'http://p.moreover.com/cgi-local/page?index_science+rss' => 9,
               'http://memepool.com/memepool.rss' => 11,
               'http://www.geekpress.com/index.xml' => 13,

              );

# Redirect stdout
my $debug = DEBUGFILE;
open STDERR, ">$debug" or die "Can't redirect stdout: $!";

my @sources = keys %sources;
my $count = 1;

mkpath(CACHEDIR,1,0700);

my $connection = Net::Jabber::Client->new();

log3("Making connection to Jabber server");
$connection->Connect( hostname => SERVER,
                      port     => PORT ) 
  or die "Cannot connect ($!)\n";

log3("Attempting Ident/Auth");
my @result = $connection->AuthSend( username => USER,
                                    password => PASSWORD,
                                    resource => RESOURCE );

if ($result[0] ne "ok") {
  die "Ident/Auth with server failed: $result[0] - $result[1]\n";
}

log3("Setting headline handler");
$SIG{ALRM} = \&do_headlines;

log3("Setting presence handler");
$connection->SetCallBacks( presence => \&handle_presence );

log3("Requesting roster");
$connection->RosterGet();

log3("Sending presence");
$connection->PresenceSend();

log3("Retrieving RSS for first time and setting alarm");
do_headlines();

log3("Entering main loop");
while(defined($connection->Process())) { }

log3("Cancelling alarm");
alarm(0);

print "ERROR: The connection was killed...\n";

exit(0);

sub do_headlines {

  $count += 1;
#  foreach my $source (@sources) {
  foreach my $source (grep { ($count/$sources{$_}) == int($count/$sources{$_}) } (sort keys %sources)) {

    # Retrieve the RSS
    log3("Getting $source");
    my $data = get($source);

    # Skip if cannot retrieve
    unless (defined($data)) {
      log1("Cannot retrieve $source - skipping");
      next;
    }

    # Process any messages
    $connection->Process(1);

    my $rss = XML::RSS->new();

    # Parse the RSS and get the items
    $rss->parse($data);
    my @items = @{$rss->{items}};

    # Discover any new items
    log3("Looking for new items");
    foreach my $item (@items) {

      # Stop looking if we reach an item we've
      # already seen
      last if exists $cache{$source} and $cache{$source} eq $item->{link};

      my $orglink = $item->{link};
      my $link = $orglink;

      # Check cache
      my $file = CACHEDIR . "/" . uri_escape($link,"^A-Za-z0-9\\.:\\?");
      if ($fullcache{$link} or -r $file) {
        log2("Skipping because previously seen: $link");
        next;
      }

      log2("New item from $source - $item->{title}");

      # Create headline message
      my $msg = Net::Jabber::Message->new();
  
      $msg->SetMessage(
                        type     => 'headline',
                        subject  => $item->{title}, 
                        body     => $item->{description}, 
                      );

      # Add to cache
      log3("Adding $link ($file) to cache");
      $fullcache{$link} = 1;
      open TOUCH, ">$file" or warn "Couldn't create/touch cache file $file: $!";
      close TOUCH;

      my $oob = $msg->NewX('jabber:x:oob');
      $oob->SetDesc($item->{title});

      # If moreover, find the real URL.
      if ($orglink =~ m{^http://[a-z]\.moreover\.com/}) {
        my $ua = LWP::UserAgent->new;

        my $request = HTTP::Request->new('GET',$orglink);
        my $response = $ua->simple_request($request);
        if ($response->header('location')) {
          log3("Successful retrieval redirected of $orglink");
          $link = $response->header('location');
        } else {
          my $headers = $response->headers_as_string;
          log3("Unsuccessful retrieval of $orglink");
          log3("Headers: $headers");
          $link = $orglink;
        }
        log2("Replacing moreover URL $orglink with $link");
      }

      $oob->SetURL($link);
 
      my @sendees;

      # Send the headline to all that are present
      foreach my $recipient (keys %present) {
        sleep 2;
        $msg->SetTo($recipient);
        $connection->Send($msg);
        push @sendees, $recipient;
      }
            
      log2("Sent to ".(join(", ", @sendees) || "nobody"));

       # This will prevent all the items being
      # counted as new the first time through
      # the loop (but allows the first item in
      # the RSS to be sent).
      #last unless exists($cache{$source});
    }

    # Remember the latest new item
    $cache{$source} = $items[0]->{link};
  }

  log3("Setting alarm");
  alarm(DELAY);
}

sub handle_presence {
 
  my $presence = new Net::Jabber::Presence(@_);
  my $jid  = $presence->GetFrom();
  my $show = $presence->GetShow();
  my $type = $presence->GetType();

  $jid =~ s!\/.*$!!; # remove any resource suffix from JID

  log3("Presence from $jid:\n".$presence->GetXML());
 
  # Subscription request:
  # Accept, and request subscription to them.
  if ($type eq "subscribe") {
    log3("$jid requests subscription");
    $connection->Send($presence->Reply(type => 'subscribed'));
    $connection->Send($presence->Reply(type => 'subscribe'));
  }

  # Request to unsubscribe:
  # Acknowledge, and request unsubscription from them.
  # Don't forget to remove them from the present list, too.
  if ($type eq "unsubscribe") {
    log3("$jid requests unsubscription");
    $connection->Send($presence->Reply(type => 'unsubscribed'));
    $connection->Send($presence->Reply(type => 'unsubscribe'));
    delete $present{$jid};
  }

  # User has disconnected
  if ($type eq "unavailable") {
    log3("$jid unavailable");
    delete $present{$jid};
  }

  # Default presence information (type is blank)
  $present{$jid} = 1 if $type eq "";
}

sub log1 {
  # WARN
  my $msg = shift;
  return unless VERBOSE >= 1;
  print STDERR "WARN: $msg\n";
}

sub log2 {
  # INFO
  my $msg = shift;
  return unless VERBOSE >= 2;
  print STDERR "INFO: $msg\n";
}

sub log3 {
  # DBUG
  my $msg = shift;
  return unless VERBOSE >= 3;
  print STDERR "DBUG: $msg\n";
}

-- end --

-- jared at wordzoo.com

"Come, let us retract the foreskin of misconception and apply the wire
 brush of enlightenment." -- Geoff Miller




More information about the JDev mailing list