#!/usr/local/bin/perl # # $Id: htdig-dump,v 1.3 2001/01/09 12:03:54 cmdjb Exp $ # # Dump ht://dig docs database # # Copyright (C) 1997-2001 Dave Beckett - http://purl.org/net/dajobe/ # # USAGE: htdig-dump # # and the SOIF records are printed to stdout # require 5.004; use strict; use GDBM_File; use File::Basename; # Local modules use Metadata::SOIF; $::VERSION=(split(/ /, q$Id: htdig-dump,v 1.3 2001/01/09 12:03:54 cmdjb Exp $))[2]; $::DEBUG=0; $::prog_name=basename $0; # CONSTANTS @::DOC_REF_TAGS=( [qw(number ID)], [qw(number Time)], [qw(number Accessed)], [qw(number State)], [qw(number Size)], [qw(number Links)], [qw(number ImageSize)], [qw(number HopCount)], [qw(string URL)], [qw(string Head)], [qw(string Title)], [qw(list Descriptions)], [qw(list Anchors)], [qw(string Email)], [qw(string Notification)], [qw(string Subject)], [qw(number DOCSTRING)], # Not used ); %::HTDIG_TO_SOIF=( ID => 'HTDIG-Object-Identifier', Time => 'Last-Modification-Time', Accessed => 'HTDIG-Last-Accessed-Time', State => 'HTDIG-State', Size => 'File-Size', Links => 'HTDIG-Links', ImageSize => 'HTDIG-ImageSize', HopCount => 'HTDIG-HopCount', URL => 'URL', # special, goes to URL field later Head => 'Body', Title => 'Title', Descriptions => 'Abstract', Anchors => 'HTDIG-Anchors', Email => 'Author', Notification => 'HTDIG-Notification', Subject => 'Keywords', ); &main(@ARGV); exit 0; sub main (@) { die "USAGE: $::prog_name: \n" unless @_; my $file=shift; my $mode=&GDBM_READER(); die "$::prog_name: No such htdig docs database file $file\n" unless -r $file; warn "$::prog_name: Version $::VERSION reading $file\n"; my(%db); tie %db, 'GDBM_File', $file, $mode, 0644 or die "$::prog_name: Could not tie GDBM_File $file\n"; my $count=0; while(my($key,$data)=each %db) { my $soif=new Metadata::SOIF; deserialise_htdig_doc_ref($soif, $data); convert_htdig_metadata($soif); print $soif->as_string,"\n"; last if $count++>100; } untie %db; } sub deserialise_htdig_doc_ref ($$) { my($md,$data)=@_; $md->clear; my $sizeof_int=length(pack('i',0)); my $d=$data; $d=~ tr/[ -~]/./cd; warn "Data is '$d' (", length($data), " bytes)\n" if $::DEBUG>3; my $offset=0; while($offset <= length($data)) { my $tag=unpack('C', substr($data,$offset,1)); last if !defined $tag; die "$::prog_name: Unknown tag $tag found\n" unless $::DOC_REF_TAGS[$tag]; my($type,$name)=@{$::DOC_REF_TAGS[$tag]}; my $soif_name=$::HTDIG_TO_SOIF{$name}; warn "$offset: Tag $tag - Type $type Name $name\n" if $::DEBUG; $offset++; if ($type eq 'number') { # Original C: #define getnum(in, var) memcpy((char *) &var, in, sizeof(var)); in += sizeof(var) my $num=unpack('i', substr($data,$offset,$sizeof_int)); warn "$offset: Number $num\n" if $::DEBUG; $offset+=$sizeof_int; $md->set($soif_name,$num); } elsif ($type eq 'string') { # Original C: #define getstring(in, str) getnum(in, length); str = 0; str.append(in, length); in += length my $length=unpack('i', substr($data,$offset,$sizeof_int)); $offset+=$sizeof_int; my $string=substr($data,$offset,$length); my $qs=$string; $qs=~ s/([^ -~])/sprintf("\\x%02X",ord($1))/ge; warn "$offset: String '$qs'\n" if $::DEBUG; $offset+=$length; $md->set($soif_name,$string); } else { # list # Original C: #define getlist(in, list) getnum(in, count); for (i = 0; i < count; i++) { getnum(in, length); str = new String; str->append(in, length); list.Add(str); in += length; } my $count=unpack('i', substr($data,$offset,$sizeof_int)); $offset+=$sizeof_int; my(@list); for my $i (1..$count) { my $length=unpack('i', substr($data,$offset,$sizeof_int)); $offset+=$sizeof_int; my $string=substr($data,$offset,$length); warn "$offset: List String #$i '$string'\n" if $::DEBUG; $offset+=$length; push(@list, $string); } $md->set($soif_name,\@list); } } } sub convert_htdig_metadata ($) { my($soif)=shift; my $url=$soif->get('URL'); $soif->delete('URL'); $soif->url($url); my $title; my $abstract; if(!($title=$soif->get('Title')) && ($abstract=$soif->get('Abstract')) && length($abstract)<100) { $soif->set('Title', $title=$abstract); $soif->delete('Abstract'); $abstract=undef; } if ($title && $abstract) { $title=~s/\s+$//; $abstract=~s/\s+$//; if ($title eq $abstract) { $soif->delete('Abstract'); $abstract=undef; } } my $type='HTML'; $type='Postscript' if $url =~ /\.e?ps$/i; $soif->set('Type', $type); #my $b=$soif->get('Body'); #my $digest=MD5->hexhash($b); #$soif->set('MD5', $digest); } __END__