#!/usr/bin/perl -w # This code is a part of Slash, and is released under the GPL. # Copyright 1997-2001 by Open Source Development Network. See README # and COPYING for more information, or see http://slashcode.com/. # $Id: slashcode-dearchive,v 1.2.2.3 2001/08/06 16:18:15 cliff Exp $ use strict; use File::Basename; use FindBin '$Bin'; use Getopt::Std; use Slash::DB; use Slash::DB::Utility; use Date::Manip; (my $VERSION) = ' $Revision: 1.2.2.3 $ ' =~ /\$Revision:\s+([^\s]+)/; my $PROGNAME = basename($0); (my $PREFIX = $Bin) =~ s|/[^/]+/?$||; # REGULAR EXPRESSION DEFINITIONS # Enclosure defs. my $enclosure_regexps = { "Slash v0.9.2+" => { regexp => '(.+?)', extractors => '*', id_extractors => '*' }, 'Slashdot Jan/00+' => { regexp => '(?:.+?)?(?:|) ?(<(?i:A NAME).+?.+?) ? ', extractors => '*', id_extractors => '*', }, }; # Extractor defs. my %extractor_regexps = ( 'Slash v0.9.2+' => '(.+?)(.+?).+?by (.+?) on (.+?)$.+?$
.+?(?:$.+?User Info$.+?(?:(.+?)
)?)?.+?.+?.+?(.+?)', 'Slashdot Jan/00+' => '^<(?i:A NAME)="([^>]+?)">(.+?)?(.+?)
by (.+?) ?on ((?:Mon|Tue|Wed|Thur|Fri|Sat|Sun|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec).+?)
?(?:$.+?User (?:#\d+ )?Info$ ?(?i:(.+?)
*)?)? ?(.+)
', #'^(.+?)(.+?)
by (.+?) on (.+?)
(?:$.+?User Info$ (?:(.+?)
)?)? (.+?)
(.+?)? (?: |

.+)?$' ); # SID extration defs. my %idextractor_regexps = ( 'Slash v0.9.2+' => '', ); # Think about having regular expressions that can be applied to extracted # stories to look for parent information. # # - Cliff 6/6/01 my %opts; # Remember to doublecheck these match usage()! usage('Options used incorrectly') unless getopts('u:I:f:y:E:N:CDV?hv', \%opts); usage() if ($opts{'h'} || !keys %opts); usage("Must specify both -f and -u!") if !$opts{f} || !$opts{u}; version() if $opts{'v'}; my($virtuser, $year, $infile, $enclosure, $extractor, $sid, $ovrw, $confirm, $debug)= @opts{qw[u y f N E I V C D]}; $virtuser||= 'slash'; $year ||= UnixDate('now', '%Y'); # Initialize Slash site database object. my $slashdb = new Slash::DB($virtuser) || die "*** Can't open database on Virtual User $virtuser.\n"; # Set our timezone and initialize the date routines. $ENV{TZ} = 'EST5EDT'; &Date_Init(); # Globals. my(%data); my $ac_uid = $slashdb->getVar('anonymous_coward_uid', 'value'); my $ac_nickname = $slashdb->getUser($ac_uid, 'nickname'); # We slurp in everything as one big file. This will NOT be pleasant on machines # with low amounts of memory (anything < 32M; quite a few Slashdot .shtml files # can amass well over one meg in size and that's for a SINGLE scalar. my($bigscalar, $test_xdata, $errors); die "Can't open INFILE '$infile'\n" if ! -f $infile; do { local $/ = undef; open(INPUT, "<$infile"); $bigscalar = ; close(INPUT); # Remove newlines, they only get in the way. $bigscalar =~ s/\n/ /g; }; # Determine which enclosure and extractor regular expressions to use for the given # file; note that choices for enclosures and/or extractors can be forced by command # line options. if (!$enclosure || !$extractor) { my(@enclosures); push (@enclosures, ($enclosure) ? $enclosure : sort keys %{$enclosure_regexps}); $enclosure=''; for (@enclosures) { printf STDERR "%s ENCLOSURE '$_'\n", ($enclosure) ? 'Using':'Trying' if $debug; $enclosure = $_ if $bigscalar =~ /$enclosure_regexps->{$_}->{regexp}/s; next if ! $enclosure; if (! $extractor) { $test_xdata = $1; my @extractors; if (ref $enclosure_regexps->{$_}->{extractors} eq 'ARRAY') { push @extractors, @{$enclosure_regexps->{$_}->{extractors}}; } elsif ($enclosure_regexps->{$_}->{extractors} eq '*') { push @extractors, sort keys %extractor_regexps; } for (@extractors) { print STDERR "\tTrying EXTRACTOR '$_'\n" if $debug; if ($test_xdata =~ /$extractor_regexps{$_}/s) { $extractor = $_; last; } } } last if $enclosure && $extractor; } } die "$infile: Can't determine enclosure/extractor pair!\n" if !$enclosure && !$extractor; die <sqlSelect('*', 'comments', 'sid=' . quote($sid))) { if (!$ovrw) { print "Data already exists for article '$sid'!!\n"; exit 1; } else { print "Deleting all existing comments from article '$sid'\n"; $slashdb->sqlDo('DELETE FROM comments WHERE sid=' . quote($sid)); } } print "Beginning extraction for article '$sid'...\n"; if ($confirm) { print "\n\n\n\n"; my $input = ; } my($en_regexp, $ex_regexp) = ($enclosure_regexps->{$enclosure}->{regexp}, $extractor_regexps{$extractor}); $errors = 0; pos $bigscalar = 0; while ($bigscalar =~ /$en_regexp/sg) { my($uid, $cid, $email_addy, $uid_s, $x_data, $wait, $result, $resultpos); my($subj, $score_ind, $username, $ts, $url, $url_name, $comment, $sig); my($gmt_time, $score, $posttime, $uname); my(%uid_cache, $gmt_ts, $rc); $x_data = $1; $result = 'Unextracted '; $resultpos = ''; if ($x_data =~ /$ex_regexp/s) { # This is ugly... ($cid, $subj, $score_ind, $username, $ts, $url, $url_name, $comment, $sig) = ($1, $2, $3, $4, $5, $6, $7, $8, $9); next if !$cid; printf STDERR "EXTRACTED: %d, %d, %d, %d, %d, %d, %d, %d, %d\n", map { ($_) ? $_ : ''; 1 } ($cid, $subj, $score_ind, $username, $ts, $url, $url_name, $comment, $sig) if $debug; # Copy $username. $uname = $username; # Extract and remove email information. if ($uname =~ s/$(.+?)$<\/FONT><\/B>//) { $email_addy = $1; } # Fix $uname if it is a link. $uname =~ s/<(?i:A HREF)=".*?">(.+?)<\/A>.+$/$1/s; # Remove any and all backtics (`) from the user name since this breaks # the API for some reason. $uname =~ s/\`//g; # Grab score from indicator. if ($score_ind =~ /Score:(\d+)/) { $score = $1 if defined $1; } # Strip signature of and tags (these are part of display # code) $sig =~ s!(?:|)!!g if $sig; ($rc, $result) = ('', ''); $uid_s = defined $email_addy ? " <$email_addy>" : ''; # Use database to see if we can match the nickname to a UID. if ($uname && $uname ne $ac_nickname) { # Check user id cache for hit on the parsed username. $uid = 0; if ($uid_cache{$uname}) { $uid = $uid_cache{$uname}; } else { $uid = $slashdb->getUserUID($uname); $uid_cache{$uname} = $uid if $uid; } $uid_s .= " ($uid)" if $uid; } else { $uid = $ac_uid if $uname eq $ac_nickname; } # Begin clean up routines....these routines are REAL site specific # so someone should look in to extending the data structures above # to make it less so. I may if I actually get some time to play # around with this code a bit more, but that's extremely low # priority right now. Maybe come around 2.2 or 2.4 release time. # # - Cliff 3/22/2001 # Convert date to internal representation and then adjust for GMT # time (since all times should be stored as GMT in dB). if ($ts) { # Some versions of dispComment have the a link to the comment # right after the date. This should be removed. $ts =~ s///i; # Get rid of the extraneous commas and '@' symbols which may # interfere with parsing. $ts =~ s/,\s@/ /g; # Date::Manip balks if DoW is present in a datespec which specifies # no year value and does NOT represent a date within the current # year. So the simple solution is to strip the DoW which appears # at the beginning of the string. This really depends on the site # but this is the default for Slashcode and also works on Slashdot. $ts =~ s/(?:Mon|Tues|Wednes|Thurs|Fri|Satur|Sun)day\s+//; # Now we must INSURE that the proper year is set. Note that Date::Manip # will use the CURRENT YEAR if it isn't specified in the string, so # we'll have to carefully insert the year into the timestamp. # # The following assumes that $ts will always be of the form: # $ts =~ s/(\w+ \d+)/$1, $year/; # Now grab the zone associated with the date. $ts =~ /\s([^\ ]+?)$/; # Need $oldPos becuase Date::Manip routines mess with $_. my($oldPos) = pos $_; my($tsdate, $tszone) = (ParseDate($ts), $1); # Insure date is set to the correct year based on parameters. # Could manually set this by SID, but SID isn't really # Y2K compliant... $tsdate = Date_SetDateField($tsdate, 'y', $year); if ($tsdate) { $gmt_ts = Date_ConvTZ($tsdate, $tszone, 'GMT'); $ts .= ' (' . UnixDate($gmt_ts, "%A %B %d, %Y \@%r GMT") . ')'; } pos $_ = $oldPos; } # Attribute comment to the proper nickname in the comment if the # user doesn't turn up in the database. Be sure to use the # sanitized user name since $username may contain characters that # will confuse DBI. $comment = "Posted by $uname:

$comment" if (! $uid); print "\n", header("Extracted ${\(length($x_data))} bytes"), ($cid ? "#$cid\n" : ""), ($subj ? "Subj: $subj\n": ""), ($uname ? "User: $uname$uid_s\n" : ""), ($ts ? "Date: $ts\n" : ""), ($url_name ? "URL: $url_name " : ""), ($url ? "($url)\n" : ($url_name ? "\n" : "")), ($score ? "Score: $score" : ""), ($comment ? "\n--> $comment\n\n" : "\n"), ($sig ? "\nSig: $sig\n\n" : "\n") if $debug; if ($confirm) { print ""; my $input = ; } # Assign data to hash. $data{$cid}->{subject} = $subj || ''; $data{$cid}->{'-uid'} = $uid || $ac_uid; $data{$cid}->{date} = $gmt_ts; $data{$cid}->{comment} = $comment; $data{$cid}->{'-points'} = ($score) ? $score : 0; } print header("${result}X-Data$resultpos") . "$x_data\n" if $debug; if ($result eq 'Unextracted ' || $rc || $comment =~ /<\/?TD>/ || $comment =~ /<\/?TR>/) { if ($debug) { print '--- Check data ---'; $wait = ; } $errors++; } } # Unload massive scalar (would setting $bigscalar to undef be a bad thing here?) $bigscalar = ''; # Now write data to database. print "Writing to database..."; my $nr = 0; foreach (keys(%data)) { # We process comments into the database by CID and let it worry about # all of the sorting. # Copy and sanitize data for insert into the comments table. my(%sqldata) = %{$data{$_}}; $sqldata{sid} = $sid; $sqldata{'-cid'} = $_; # Until nested mode support can be coded in, Parent ID data will be lost. $sqldata{'-pid'} = 0; # Since host_name information is not included in the .shtml, we lose that # information as well. $sqldata{host_name} = ''; # This data isn't as important. It would be a pain in the ass to go thru # and collect the moderation data for proper insertion, so it's just # easier to leave it out for now. $sqldata{'-lastmod'} = -1; $sqldata{'-reason'} = 0; $sqldata{'date'} = UnixDate($data{$_}->{date}, '%Y-%m-%d %H:%M'); # Can $slashdb return meaningful errors if one of its methods fails? # # Use createComments(), here...especially since the comment system # is changing and these details should be behind the API. if ($slashdb->sqlInsert('comments', \%sqldata)) { $nr++; } else { print "\nError occured while inserting comment #$_!\n"; } } print "\n$nr records written to COMMENTS for SID '$sid'\n"; print "*** $errors errors occured during processing.\n" if $errors; 0; sub header { my($text) = shift; "$text\n${\('-' x length($text))}\n"; } # It would be nice if the Slash DB API would provide this. sub quote { my($text) = shift; $text =~ s/([\W\D])/\\$1/g; $text = "'$text'"; $text; } sub usage { print "*** $_[0]\n" if $_[0]; my (@enlist) = sort keys %{$enclosure_regexps}; my (@exlist) = sort keys %extractor_regexps; my (@idexlist) = sort keys %idextractor_regexps; local $" = "\t\t"; printf < -f -I -y -N -E Command line switches: -C Confirms all options and provides prompts during processing operations. Used primarily for debugging. -V Overwrite any comments for the given discussion ID. -D Debug mode. Notes: If -I is not given, then the program will cycle thru all defined ID extractors in an attempt to find a key suitable for database insertion. If no ID can be found, then the program will quit with an error and the use of -I will be required. Defined comment enclosures: %s Defined comment extractors: %s Defined ID extractors: %s EOT exit; } sub version { print <