#!/usr/bin/perl -w
# This code is a part of Slash, and is released under the GPL.
# Copyright 1997-2001 by Open Source Development Network. See README
# and COPYING for more information, or see http://slashcode.com/.
# $Id: slashcode-dearchive,v 1.2.2.3 2001/08/06 16:18:15 cliff Exp $
use strict;
use File::Basename;
use FindBin '$Bin';
use Getopt::Std;
use Slash::DB;
use Slash::DB::Utility;
use Date::Manip;
(my $VERSION) = ' $Revision: 1.2.2.3 $ ' =~ /\$Revision:\s+([^\s]+)/;
my $PROGNAME = basename($0);
(my $PREFIX = $Bin) =~ s|/[^/]+/?$||;
# REGULAR EXPRESSION DEFINITIONS
# Enclosure defs.
my $enclosure_regexps = {
"Slash v0.9.2+" => {
regexp =>
'(.+?)',
extractors => '*',
id_extractors => '*'
},
'Slashdot Jan/00+' => {
regexp =>
'(?:
(.+) ',
#'^(.+?)(.+?) by (.+?) on (.+?) (?:\(.+?User Info\) (?:(.+?) )?)?
(.+?) (.+?)?
(?:
|
.+)?$'
);
# SID extration defs.
my %idextractor_regexps = (
'Slash v0.9.2+' =>
']+?)?article\.pl\?sid=([^>]+?)"?>',
);
# Think about having regular expressions that can be applied to extracted
# stories to look for parent information.
#
# - Cliff 6/6/01
my %opts;
# Remember to doublecheck these match usage()!
usage('Options used incorrectly')
unless getopts('u:I:f:y:E:N:CDV?hv', \%opts);
usage() if ($opts{'h'} || !keys %opts);
usage("Must specify both -f and -u!") if !$opts{f} || !$opts{u};
version() if $opts{'v'};
my($virtuser, $year, $infile, $enclosure, $extractor, $sid, $ovrw, $confirm, $debug)=
@opts{qw[u y f N E I V C D]};
$virtuser||= 'slash';
$year ||= UnixDate('now', '%Y');
# Initialize Slash site database object.
my $slashdb = new Slash::DB($virtuser) ||
die "*** Can't open database on Virtual User $virtuser.\n";
# Set our timezone and initialize the date routines.
$ENV{TZ} = 'EST5EDT';
&Date_Init();
# Globals.
my(%data);
my $ac_uid = $slashdb->getVar('anonymous_coward_uid', 'value');
my $ac_nickname = $slashdb->getUser($ac_uid, 'nickname');
# We slurp in everything as one big file. This will NOT be pleasant on machines
# with low amounts of memory (anything < 32M; quite a few Slashdot .shtml files
# can amass well over one meg in size and that's for a SINGLE scalar.
my($bigscalar, $test_xdata, $errors);
die "Can't open INFILE '$infile'\n" if ! -f $infile;
do {
local $/ = undef;
open(INPUT, "<$infile");
$bigscalar = ;
close(INPUT);
# Remove newlines, they only get in the way.
$bigscalar =~ s/\n/ /g;
};
# Determine which enclosure and extractor regular expressions to use for the given
# file; note that choices for enclosures and/or extractors can be forced by command
# line options.
if (!$enclosure || !$extractor) {
my(@enclosures);
push (@enclosures, ($enclosure) ? $enclosure : sort keys %{$enclosure_regexps});
$enclosure='';
for (@enclosures) {
printf STDERR "%s ENCLOSURE '$_'\n", ($enclosure) ? 'Using':'Trying'
if $debug;
$enclosure = $_ if $bigscalar =~ /$enclosure_regexps->{$_}->{regexp}/s;
next if ! $enclosure;
if (! $extractor) {
$test_xdata = $1;
my @extractors;
if (ref $enclosure_regexps->{$_}->{extractors} eq 'ARRAY') {
push @extractors, @{$enclosure_regexps->{$_}->{extractors}};
} elsif ($enclosure_regexps->{$_}->{extractors} eq '*') {
push @extractors, sort keys %extractor_regexps;
}
for (@extractors) {
print STDERR "\tTrying EXTRACTOR '$_'\n" if $debug;
if ($test_xdata =~ /$extractor_regexps{$_}/s) {
$extractor = $_;
last;
}
}
}
last if $enclosure && $extractor;
}
}
die "$infile: Can't determine enclosure/extractor pair!\n" if !$enclosure && !$extractor;
die <sqlSelect('*', 'comments', 'sid=' . quote($sid))) {
if (!$ovrw) {
print "Data already exists for article '$sid'!!\n";
exit 1;
} else {
print "Deleting all existing comments from article '$sid'\n";
$slashdb->sqlDo('DELETE FROM comments WHERE sid=' . quote($sid));
}
}
print "Beginning extraction for article '$sid'...\n";
if ($confirm) {
print "\n\n\n\n";
my $input = ;
}
my($en_regexp, $ex_regexp) = ($enclosure_regexps->{$enclosure}->{regexp},
$extractor_regexps{$extractor});
$errors = 0;
pos $bigscalar = 0;
while ($bigscalar =~ /$en_regexp/sg) {
my($uid, $cid, $email_addy, $uid_s, $x_data, $wait, $result, $resultpos);
my($subj, $score_ind, $username, $ts, $url, $url_name, $comment, $sig);
my($gmt_time, $score, $posttime, $uname);
my(%uid_cache, $gmt_ts, $rc);
$x_data = $1;
$result = 'Unextracted ';
$resultpos = '';
if ($x_data =~ /$ex_regexp/s) {
# This is ugly...
($cid,
$subj,
$score_ind,
$username,
$ts,
$url,
$url_name,
$comment,
$sig) = ($1, $2, $3, $4, $5, $6, $7, $8, $9);
next if !$cid;
printf STDERR
"EXTRACTED: %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
map { ($_) ? $_ : ''; 1 }
($cid, $subj, $score_ind, $username, $ts, $url,
$url_name, $comment, $sig)
if $debug;
# Copy $username.
$uname = $username;
# Extract and remove email information.
if ($uname =~ s/\((.+?)\)<\/FONT><\/B>//) {
$email_addy = $1;
}
# Fix $uname if it is a link.
$uname =~ s/<(?i:A HREF)=".*?">(.+?)<\/A>.+$/$1/s;
# Remove any and all backtics (`) from the user name since this breaks
# the API for some reason.
$uname =~ s/\`//g;
# Grab score from indicator.
if ($score_ind =~ /Score:(\d+)/) {
$score = $1 if defined $1;
}
# Strip signature of