#!/usr/bin/perl -w # This code is a part of Slash, and is released under the GPL. # Copyright 1997-2001 by Open Source Development Network. See README # and COPYING for more information, or see http://slashcode.com/. # $Id: slashcode-dearchive,v 1.2.2.3 2001/08/06 16:18:15 cliff Exp $ use strict; use File::Basename; use FindBin '$Bin'; use Getopt::Std; use Slash::DB; use Slash::DB::Utility; use Date::Manip; (my $VERSION) = ' $Revision: 1.2.2.3 $ ' =~ /\$Revision:\s+([^\s]+)/; my $PROGNAME = basename($0); (my $PREFIX = $Bin) =~ s|/[^/]+/?$||; # REGULAR EXPRESSION DEFINITIONS # Enclosure defs. my $enclosure_regexps = { "Slash v0.9.2+" => { regexp => '(.+?)', extractors => '*', id_extractors => '*' }, 'Slashdot Jan/00+' => { regexp => '(?:.+?)?(?:|) ?(<(?i:A NAME).+?.+?) ? ', extractors => '*', id_extractors => '*', }, }; # Extractor defs. my %extractor_regexps = ( 'Slash v0.9.2+' => '(.+?)(.+?).+?by (.+?) on (.+?)\(.+?\)
.+?(?:\(.+?User Info\).+?(?:(.+?)
)?)?.+?.+?.+?(.+?)', 'Slashdot Jan/00+' => '^<(?i:A NAME)="([^>]+?)">(.+?)?(.+?)
by (.+?) ?on ((?:Mon|Tue|Wed|Thur|Fri|Sat|Sun|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec).+?)
?(?:\(.+?User (?:#\d+ )?Info\) ?(?i:(.+?)
*)?)? ?(.+)
', #'^(.+?)(.+?)
by (.+?) on (.+?)
(?:\(.+?User Info\) (?:(.+?)
)?)? (.+?)
(.+?)? (?: |
  • .+)?$' ); # SID extration defs. my %idextractor_regexps = ( 'Slash v0.9.2+' => '', ); # Think about having regular expressions that can be applied to extracted # stories to look for parent information. # # - Cliff 6/6/01 my %opts; # Remember to doublecheck these match usage()! usage('Options used incorrectly') unless getopts('u:I:f:y:E:N:CDV?hv', \%opts); usage() if ($opts{'h'} || !keys %opts); usage("Must specify both -f and -u!") if !$opts{f} || !$opts{u}; version() if $opts{'v'}; my($virtuser, $year, $infile, $enclosure, $extractor, $sid, $ovrw, $confirm, $debug)= @opts{qw[u y f N E I V C D]}; $virtuser||= 'slash'; $year ||= UnixDate('now', '%Y'); # Initialize Slash site database object. my $slashdb = new Slash::DB($virtuser) || die "*** Can't open database on Virtual User $virtuser.\n"; # Set our timezone and initialize the date routines. $ENV{TZ} = 'EST5EDT'; &Date_Init(); # Globals. my(%data); my $ac_uid = $slashdb->getVar('anonymous_coward_uid', 'value'); my $ac_nickname = $slashdb->getUser($ac_uid, 'nickname'); # We slurp in everything as one big file. This will NOT be pleasant on machines # with low amounts of memory (anything < 32M; quite a few Slashdot .shtml files # can amass well over one meg in size and that's for a SINGLE scalar. my($bigscalar, $test_xdata, $errors); die "Can't open INFILE '$infile'\n" if ! -f $infile; do { local $/ = undef; open(INPUT, "<$infile"); $bigscalar = ; close(INPUT); # Remove newlines, they only get in the way. $bigscalar =~ s/\n/ /g; }; # Determine which enclosure and extractor regular expressions to use for the given # file; note that choices for enclosures and/or extractors can be forced by command # line options. if (!$enclosure || !$extractor) { my(@enclosures); push (@enclosures, ($enclosure) ? $enclosure : sort keys %{$enclosure_regexps}); $enclosure=''; for (@enclosures) { printf STDERR "%s ENCLOSURE '$_'\n", ($enclosure) ? 'Using':'Trying' if $debug; $enclosure = $_ if $bigscalar =~ /$enclosure_regexps->{$_}->{regexp}/s; next if ! $enclosure; if (! $extractor) { $test_xdata = $1; my @extractors; if (ref $enclosure_regexps->{$_}->{extractors} eq 'ARRAY') { push @extractors, @{$enclosure_regexps->{$_}->{extractors}}; } elsif ($enclosure_regexps->{$_}->{extractors} eq '*') { push @extractors, sort keys %extractor_regexps; } for (@extractors) { print STDERR "\tTrying EXTRACTOR '$_'\n" if $debug; if ($test_xdata =~ /$extractor_regexps{$_}/s) { $extractor = $_; last; } } } last if $enclosure && $extractor; } } die "$infile: Can't determine enclosure/extractor pair!\n" if !$enclosure && !$extractor; die <sqlSelect('*', 'comments', 'sid=' . quote($sid))) { if (!$ovrw) { print "Data already exists for article '$sid'!!\n"; exit 1; } else { print "Deleting all existing comments from article '$sid'\n"; $slashdb->sqlDo('DELETE FROM comments WHERE sid=' . quote($sid)); } } print "Beginning extraction for article '$sid'...\n"; if ($confirm) { print "\n\n\n\n"; my $input = ; } my($en_regexp, $ex_regexp) = ($enclosure_regexps->{$enclosure}->{regexp}, $extractor_regexps{$extractor}); $errors = 0; pos $bigscalar = 0; while ($bigscalar =~ /$en_regexp/sg) { my($uid, $cid, $email_addy, $uid_s, $x_data, $wait, $result, $resultpos); my($subj, $score_ind, $username, $ts, $url, $url_name, $comment, $sig); my($gmt_time, $score, $posttime, $uname); my(%uid_cache, $gmt_ts, $rc); $x_data = $1; $result = 'Unextracted '; $resultpos = ''; if ($x_data =~ /$ex_regexp/s) { # This is ugly... ($cid, $subj, $score_ind, $username, $ts, $url, $url_name, $comment, $sig) = ($1, $2, $3, $4, $5, $6, $7, $8, $9); next if !$cid; printf STDERR "EXTRACTED: %d, %d, %d, %d, %d, %d, %d, %d, %d\n", map { ($_) ? $_ : ''; 1 } ($cid, $subj, $score_ind, $username, $ts, $url, $url_name, $comment, $sig) if $debug; # Copy $username. $uname = $username; # Extract and remove email information. if ($uname =~ s/\((.+?)\)<\/FONT><\/B>//) { $email_addy = $1; } # Fix $uname if it is a link. $uname =~ s/<(?i:A HREF)=".*?">(.+?)<\/A>.+$/$1/s; # Remove any and all backtics (`) from the user name since this breaks # the API for some reason. $uname =~ s/\`//g; # Grab score from indicator. if ($score_ind =~ /Score:(\d+)/) { $score = $1 if defined $1; } # Strip signature of and tags (these are part of display # code) $sig =~ s!(?:|)!!g if $sig; ($rc, $result) = ('', ''); $uid_s = defined $email_addy ? " <$email_addy>" : ''; # Use database to see if we can match the nickname to a UID. if ($uname && $uname ne $ac_nickname) { # Check user id cache for hit on the parsed username. $uid = 0; if ($uid_cache{$uname}) { $uid = $uid_cache{$uname}; } else { $uid = $slashdb->getUserUID($uname); $uid_cache{$uname} = $uid if $uid; } $uid_s .= " ($uid)" if $uid; } else { $uid = $ac_uid if $uname eq $ac_nickname; } # Begin clean up routines....these routines are REAL site specific # so someone should look in to extending the data structures above # to make it less so. I may if I actually get some time to play # around with this code a bit more, but that's extremely low # priority right now. Maybe come around 2.2 or 2.4 release time. # # - Cliff 3/22/2001 # Convert date to internal representation and then adjust for GMT # time (since all times should be stored as GMT in dB). if ($ts) { # Some versions of dispComment have the a link to the comment # right after the date. This should be removed. $ts =~ s/\(\)//i; # Get rid of the extraneous commas and '@' symbols which may # interfere with parsing. $ts =~ s/,\s@/ /g; # Date::Manip balks if DoW is present in a datespec which specifies # no year value and does NOT represent a date within the current # year. So the simple solution is to strip the DoW which appears # at the beginning of the string. This really depends on the site # but this is the default for Slashcode and also works on Slashdot. $ts =~ s/(?:Mon|Tues|Wednes|Thurs|Fri|Satur|Sun)day\s+//; # Now we must INSURE that the proper year is set. Note that Date::Manip # will use the CURRENT YEAR if it isn't specified in the string, so # we'll have to carefully insert the year into the timestamp. # # The following assumes that $ts will always be of the form: #