#!/usr/bin/perl -w # # dbjoin # Copyright (C) 1991-1998 by John Heidemann # $Id: dbjoin,v 1.22 2006/04/21 18:15:45 johnh Exp $ # # This program is distributed under terms of the GNU general # public license, version 2. See the file COPYING # in $dblibdir for details. # sub usage { print <getopt) { $ch = $dbopts->opt; if ($ch eq 'S') { $sorting_required = 0; } elsif ($ch eq 'd') { $debug++; } elsif ($ch eq 'i') { $non_match_inclusion++; } elsif ($ch eq 'e') { $null_value = $dbopts->optarg; } else { &usage; }; }; &usage if ($#ARGV < 2); my($aname, $bname) = @ARGV; shift; shift; my($join_columns) = join(" ", @ARGV); if ($sorting_required) { open(A, "cat $aname | $dbbindir/dbsort $join_columns |") || die("$prog: cannot run dbsort over ``$aname''.\n"); } else { open(A, "<$aname") || die("$prog: cannot open ``$aname''.\n"); }; my($aheader, @arawdata, @acomments, @adata, $a_fs_code, @acoloptions, @acolnames, @asubcolnames, %acolnametonum); $aheader = ; @arawdata = ; @acomments = grep(/^#/, @arawdata); @adata = grep(!/^#/, @arawdata); chomp(@adata); undef @arawdata; close (A); &process_header($aheader); $a_fs_code = $fs_code; @acoloptions = @coloptions; @acolnames = @colnames; @asubcolnames = @acolnames; %acolnametonum = %colnametonum; @colnames = (); %colnametonum = (); if ($sorting_required) { open(B, "cat $bname | $dbbindir/dbsort $join_columns |") || die("$prog: cannot run dbsort over ``$bname''.\n"); } else { open(B, "<$bname") || die("$prog: cannot open ``$bname''.\n"); }; my($bheader, @brawdata, @bcomments, @bdata, $b_fs_code, @bcoloptions, @bcolnames, @bsubcolnames, %bcolnametonum); $bheader = ; @brawdata = ; @bcomments = grep(/^#/, @brawdata); @bdata = grep(!/^#/, @brawdata); chomp(@bdata); undef @brawdata; close (B); &process_header($bheader); $b_fs_code = $fs_code; @bcoloptions = @coloptions; @bcolnames = @colnames; @bsubcolnames = @bcolnames; %bcolnametonum = %colnametonum; # reset @colnames = (); %colnametonum = (); die "$prog: cannot handle input data with different field separators.\n" if ($a_fs_code ne $b_fs_code); @coloptions = @acoloptions; # NEEDSWORk # # figure the joined columns # $i = 0; my %join_keys; for $key (@ARGV) { next if ($key =~ /^-/); # we deal with this later die("$prog: column ``$key'' is not in table ``$aname''.\n") if (!defined($acolnametonum{$key})); die("$prog: column ``$key'' is not in table ``$bname''.\n") if (!defined($bcolnametonum{$key})); $join_keys{$key} = $i; $colnames[$i] = $key; &col_mapping ($key, $i); $coltoa[$i] = $acolnametonum{$key}; $coltob[$i] = $bcolnametonum{$key}; $colina[$i] = 1; $colinb[$i] = 1; $i++; } # # and the rest # foreach $key (@acolnames) { next if (defined($colnametonum{$key})); $colnames[$i] = $key; &col_mapping ($key, $i); $coltoa[$i] = $acolnametonum{$key}; $colina[$i] = 1; $i++; } foreach $key (@bcolnames) { # detect duplicates that are not joined upon (error) # (this represents duplicate fieds in the two merged things). # Reject this because we don't want to silently prefer one to the other. if (defined($colnametonum{$key}) && !defined($join_keys{$key})) { die "$prog: column $key is in both of the joined files, but is not joined upon.\nAll non-joined columns must be unique.\nRename one of the source columns,\nor remove one of the duplicate input columns.\n"; }; next if (defined($colnametonum{$key})); $colnames[$i] = $key; &col_mapping ($key, $i); $coltob[$i] = $bcolnametonum{$key}; $colinb[$i] = 1; $i++; } # # Figure the comparison code. # sub join_row_col_ab_fn { my($row, $colname, $n) = @_; return '$' . $row . 'f[' . ($row eq 'a' ? $coltoa[$n] : $coltob[$n]) . ']'; # ' } my($ab_compare_code) = &generate_compare_code('ab_custom_compare', 'join_row_col_ab_fn', @ARGV); sub join_row_col_aolda_fn { my($row, $colname, $n) = @_; return '$' . ($row eq 'a' ? 'a' : 'olda') . 'f[' . $coltoa[$n] . ']'; # ' } my($aolda_compare_code) = &generate_compare_code('aolda_custom_compare', 'join_row_col_aolda_fn', @ARGV); sub join_row_col_boldb_fn { my($row, $colname, $n) = @_; return '$' . ($row eq 'a' ? 'b' : 'oldb') . 'f[' . $coltob[$n] . ']'; # ' } my($boldb_compare_code) = &generate_compare_code('boldb_custom_compare', 'join_row_col_boldb_fn', @ARGV); &write_header(); # # join the data (assumes data already sorted) # my($oldai, $oldbi, $ai, $bi, $firstmatchingbi, $lastmatchingbi, $inmatch); $oldai = $oldbi = -1; # keep track of the last entry to check for sortedness $ai = $bi = 0; $firstmatchingbi = $lastmatchingbi = -1; # $firstmatchingbi keeps track of the head of a run of bi's that match # $lastmatchingbi is different, it's the highestbi that's ever matched $inmatch = 0; sub save_ai { my($newval) = @_; @oldaf = @af; $oldai = $ai; $ai = $newval; @af = ($ai > $#adata ? () : split(/$fsre/, $adata[$ai])); } sub save_bi { my($newval) = @_; @oldbf = @bf; $oldbi = $bi; $bi = $newval; @bf = ($bi > $#bdata ? () : split(/$fsre/, $bdata[$bi])); } # loop through all a &save_ai(0); &save_bi(0); while ($ai <= $#adata) { # # Get the two rows for comparision. # if ($bi > $#bdata) { $result = -1; } else { # init sort checking if ($oldai == -1) { @oldaf = @af; $oldai = $ai; }; if ($oldbi == -1) { @oldbf = @bf; $oldbi = $bi; }; # check for sortedness print "# $ai($oldai) $bi($oldbi)\n" if ($debug); if ($oldai < $ai) { # if (!defined($af[0])) { # warn "\$af[0] undef\n"; # }; # if (!defined($oldaf[0])) { # warn "\$oldaf[0] undef\n"; # }; die("$prog: table ``$aname'' is out of order (between data lines $oldai and $ai).\n") if (&aolda_custom_compare() < 0); }; if ($oldbi < $bi) { die("$prog: table ``$bname'' is out of order (between data lines $oldbi and $bi).\n") if (&boldb_custom_compare() < 0); }; $result = &ab_custom_compare(); # Old way: # $result2 = &compare(); # print "$result\t$result2\n"; }; if ($result != 0 && $inmatch) { $inmatch = 0; # $ai has matched before, so don't need to check non-match inclusion &save_ai($ai+1); &save_bi($firstmatchingbi); next; }; if ($result < 0) { if ($non_match_inclusion) { &generate_a_only_data(); print "# a-only\n" if ($debug); &print_joined_data(); }; &save_ai($ai+1); } elsif ($result > 0) { if ($non_match_inclusion && $bi > $lastmatchingbi) { &generate_b_only_data(); print "# b-only\n" if ($debug); &print_joined_data(); }; &save_bi($bi+1); } else { if (!$inmatch) { $inmatch = 1; $firstmatchingbi = $bi; $lastmatchingbi = $bi if ($bi > $lastmatchingbi); } else { $lastmatchingbi = $bi if ($bi > $lastmatchingbi); }; &generate_joined_data(); &print_joined_data(); &save_bi($bi+1); }; }; # # When we're done, there could be b's left. # if ($non_match_inclusion) { while ($bi <= $#bdata) { if ($bi > $lastmatchingbi) { &generate_b_only_data(); print "# post-scan b-only\n" if ($debug); &print_joined_data(); }; $bi++; &save_bi($bi); }; }; print "# $aname COMMENTS:\n" . join("", @acomments) . "# $bname COMMENTS:\n" . join("", @bcomments) . "# joined comments:\n"; print "# | $prog ", join(" ", @orig_argv), "\n"; exit 0; # for compiler warnings my($x) = $outfs; $x = $dbopts->optarg; sub generate_joined_data { my($ci); @c = (); foreach $ci (0..$#colnames) { if ($colina[$ci]) { push (@c, $af[$coltoa[$ci]]); } else { push (@c, $bf[$coltob[$ci]]); }; }; } sub generate_a_only_data { my($ci); @c = (); foreach $ci (0..$#colnames) { if ($colina[$ci]) { push (@c, $af[$coltoa[$ci]]); } else { die ("$prog: need empty value, specify with option -e Value.\n") if (!defined($null_value) && $non_match_inclusion); push (@c, $null_value); }; }; } sub generate_b_only_data { my($ci); @c = (); foreach $ci (0..$#colnames) { if ($colinb[$ci]) { push (@c, $bf[$coltob[$ci]]); } else { die ("$prog: need empty value, specify with -e Value.\n") if (!defined($null_value) && $non_match_inclusion); push (@c, $null_value); }; }; } sub print_joined_data { print join($outfs, @c) . "\n"; } sub compare { my($result); my($i) = 0; foreach (@ARGV) { next if (/^-/); $result = $af[$coltoa[$i]] cmp $bf[$coltob[$i]]; return $result if ($result != 0); $i++; }; return 0; }