#!/usr/local/bin/perl
eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
    if 0; # not running under some shell

# This is mysql-slave-restart, a program to watch replication and try to
# restart the slave on errors.
#
# This program is copyright (c) 2007 Baron Schwartz.  Feedback and
# improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA  02111-1307  USA.

use strict;
use warnings FATAL => 'all';

use DBI;
use English qw(-no_match_vars);
use Getopt::Long;
use List::Util qw(min max);
use Time::HiRes qw(sleep);
use Term::ReadKey qw(ReadMode);

# Release identifiers printed by --version.  $SVN_REV is the number extracted
# from the version-control keyword string below.
our $VERSION = '1.0.1';
our $DISTRIB = '1053';
our $SVN_REV = sprintf("%d", q$Revision: 863 $ =~ m/(\d+)/g);

# ############################################################################
# Get configuration information.
# ############################################################################

# Define cmdline args.
# Each entry describes one command-line option:
#   s => the Getopt::Long specification (long name, optional |short, type)
#   d => the one-line description printed by --help
my @opt_spec = (
    { s => 'askpass',           d => 'Prompt for a password for the connection' },
    { s => 'daemonize',         d => 'Fork to background and detach (POSIX only)' },
    { s => 'database|D=s',      d => 'Database to use' },
    { s => 'defaults-file|F=s', d => 'Only read default options from the given file' },
    { s => 'error-numbers|e=s', d => 'Only restart this comma-separated list of errors' },
    { s => 'error-text|E=s',    d => 'Only restart errors that match this pattern' },
    { s => 'error-length|L=i',  d => 'Max length of error message to print' },
    { s => 'help',              d => 'Show this help message' },
    { s => 'host|h=s',          d => 'Connect to host' },
    { s => 'maxsleep|M=f',      d => 'Maximum sleep time (default 64 sec)'},
    { s => 'minsleep|m=f',      d => 'Minimum sleep time (default 1/64 sec)'},
    { s => 'password|p=s',      d => 'Password to use when connecting' },
    { s => 'port|P=i',          d => 'Port number to use for connection' },
    { s => 'skipcount|k=i',     d => 'Number of statements to skip (default 1)' },
    { s => 'sleep|s=f',         d => 'Initial sleep time (default 1 sec)' },
    { s => 'socket|S=s',        d => 'Socket file to use for connection' },
    { s => 'time|t=s',          d => 'Time to run before exiting (suffix: s/m/h/d)' },
    { s => 'untilmaster=s',     d => 'Run until this master log file and position' },
    { s => 'untilrelay=s',      d => 'Run until this relay log file and position' },
    { s => 'user|u=s',          d => 'User for login if not current user' },
    { s => 'verbose|v+',        d => 'Verbosity (specify multiple times for more detail)' },
    { s => 'version',           d => 'Output version information and exit' },
);

# This is the container for the command-line options' values to be stored in
# after processing.  Initial values are defaults.  Keys are the short option
# letter where one exists, otherwise the long option name.
my %opts = (
    k => 1,
    s => 1,
    m => 1/64,
    M => 64,
    v => 0,
);

# Post-process...
# Derive bookkeeping fields for every option spec:
#   l => long name, t => short name (if any), k => key used in %opts,
#   n => true when the option is negatable ('!' in the spec).
my %opt_seen;
foreach my $spec ( @opt_spec ) {
    my ( $long, $short ) = $spec->{s} =~ m/^([\w-]+)(?:\|([^!+=]*))?/;
    $spec->{k} = $short || $long;
    $spec->{l} = $long;
    $spec->{t} = $short;
    $spec->{n} = $spec->{s} =~ m/!/;
    $opts{$spec->{k}} = undef unless defined $opts{$spec->{k}};
    die "Duplicate option $spec->{k}" if $opt_seen{$spec->{k}}++;
}

Getopt::Long::Configure('no_ignore_case', 'bundling');
GetOptions( map { $_->{s} => \$opts{$_->{k}} } @opt_spec) or $opts{help} = 1;

if ( $opts{version} ) {
    print "mysql-slave-restart Ver $VERSION Distrib $DISTRIB Changeset $SVN_REV\n";
    exit(0);
}

# Validate option values; any failure falls through to the help output.
if ( !$opts{help} ) {
    if ( $opts{t} ) {
        if ( $opts{t} !~ m/^\d+[smhd]?$/ ) {
            warn "Invalid --time argument\n";
            $opts{help} = 1;
        }
        elsif ( $opts{t} =~ m/(\d+)([smhd])$/ ) {
            # Convert the suffixed value to seconds.
            $opts{t} = $2 eq 's' ? $1            # Seconds
                     : $2 eq 'm' ? $1 * 60       # Minutes
                     : $2 eq 'h' ? $1 * 3600     # Hours
                     :             $1 * 86400;   # Days
        }
    }
    if ( $opts{untilmaster} ) {
        if ( $opts{untilmaster} !~ m/^[.\w-]+,\d+$/ ) {
            warn "Invalid --untilmaster argument, must be file,pos\n";
            $opts{help} = 1;
        }
    }
    if ( $opts{untilrelay} ) {
        if ( $opts{untilrelay} !~ m/^[.\w-]+,\d+$/ ) {
            warn "Invalid --untilrelay argument, must be file,pos\n";
            $opts{help} = 1;
        }
    }
}

# Prepare the list of error numbers.
if ( $opts{e} ) {
    $opts{e} = { map { $_ => 1 } $opts{e} =~ m/(\d+)/g };
}

if ( $opts{help} ) {
    print "Usage: mysql-slave-restart <options>\n\n";
    my $maxw = max(map { length($_->{l}) + ($_->{n} ? 4 : 0)} @opt_spec);
    foreach my $spec ( sort { $a->{l} cmp $b->{l} } @opt_spec ) {
        my $long  = $spec->{n} ? "[no]$spec->{l}" : $spec->{l};
        my $short = $spec->{t} ? "-$spec->{t}" : '';
        printf(" --%-${maxw}s %-4s %s\n", $long, $short, $spec->{d});
    }
    # NOTE(review): the original wording of this usage text was lost; the text
    # below is reconstructed from the POD NAME/DESCRIPTION -- confirm against
    # the upstream release.
    print <<USAGE;

mysql-slave-restart watches a MySQL replication slave and tries to restart it
after errors.  For more details, please read the documentation:

   perldoc mysql-slave-restart

USAGE
    exit(0);
}

# ############################################################################
# Get ready to do the main work.
# ############################################################################

# Maps option keys to the DSN attribute each one sets.
my %conn = (
    F => 'mysql_read_default_file',
    h => 'host',
    P => 'port',
    S => 'mysql_socket'
);

# Only options that were actually given contribute to the DSN.
my $dsn = 'DBI:mysql:' . ( $opts{D} || '' ) . ';'
    . join(';', map  { "$conn{$_}=$opts{$_}" }
                grep { defined $opts{$_} }
                qw(F h P S))
    . ';mysql_read_default_group=mysql';

if ( $opts{askpass} ) {
    print "Enter password: ";
    ReadMode('noecho');
    chomp ($opts{p} = <STDIN>);
    ReadMode('normal');
    print "\n";
}

my $dbh = DBI->connect($dsn, @opts{qw(u p)},
    { AutoCommit => 1, RaiseError => 1, PrintError => 0 } );

# VERY IMPORTANT: Lowercases all column names for fetchrow_hashref.  This is
# because different MySQL versions use different lettercase in SHOW SLAVE
# STATUS.
$dbh->{FetchHashKeyName} = 'NAME_lc';
$dbh->{InactiveDestroy}  = 1; # Don't disconnect on fork/daemonize

# Daemonize only after connecting and doing --askpass.
if ( $opts{daemonize} ) {
    require POSIX;
    chdir '/'                      or die "Can't chdir to /: $OS_ERROR";
    open STDIN,  '<', '/dev/null'  or die "Can't read /dev/null: $OS_ERROR";
    open STDOUT, '>', '/dev/null'  or die "Can't write to /dev/null: $OS_ERROR";
    defined( my $pid = fork )      or die "Can't fork: $OS_ERROR";
    exit if $pid;                  # The parent exits; the child carries on.
    POSIX::setsid()                or die "Can't start a new session: $OS_ERROR";
    open STDERR, '>&STDOUT'        or die "Can't dup STDOUT: $OS_ERROR";
}

# The statement used to (re)start the slave; the syntax depends on the server
# version, and an UNTIL clause is appended for --untilmaster / --untilrelay.
my $start_sql = version_ge($dbh, '4.0.5') ? 'START SLAVE' : 'SLAVE START';
if ( $opts{untilmaster} ) {
    my ( $file, $pos ) = split(',', $opts{untilmaster});
    $start_sql .= " UNTIL MASTER_LOG_FILE = '$file', MASTER_LOG_POS = $pos";
}
elsif ( $opts{untilrelay} ) {
    my ( $file, $pos ) = split(',', $opts{untilrelay});
    $start_sql .= " UNTIL RELAY_LOG_FILE = '$file', RELAY_LOG_POS = $pos";
}

my $fetch_stat = $dbh->prepare('SHOW SLAVE STATUS');
my $set_skip   = $dbh->prepare("SET GLOBAL SQL_SLAVE_SKIP_COUNTER = $opts{k}");
my $start      = $dbh->prepare($start_sql);

my $exit_time = time() + ($opts{t} || 0);
my $sleep     = $opts{s};
my ($last_log, $last_pos);   # Relay coordinates of the last event restarted.

# Poll the slave until --time expires (or forever when --time is not given).
while ( ( !$opts{t} || time() < $exit_time ) ) {
    my $was_running = 1;

    $fetch_stat->execute();
    my $stat = $fetch_stat->fetchall_arrayref({})->[0];
    die "No SLAVE STATUS output found\n" unless $stat;

    # Skip positions that have already been restarted once (see below).
    if ( !$last_log
        || $last_log ne $stat->{relay_log_file}
        || $last_pos != $stat->{relay_log_pos}
    ) {
        $stat->{slave_sql_running} ||= 'No';
        $stat->{last_error}        ||= '';
        $stat->{last_errno}        ||= 0;

        if ( $opts{untilmaster} && pos_ge($stat, 'master') ) {
            die "Slave has advanced past $opts{untilmaster} on master.\n";
        }
        elsif ( $opts{untilrelay} && pos_ge($stat, 'relay') ) {
            die "Slave has advanced past $opts{untilrelay} in relay logs.\n";
        }

        if ( $stat->{slave_sql_running} eq 'No' ) {

            # Print the time, error, etc
            if ( $opts{v} ) {
                my $err = '';
                if ( $opts{v} > 1 ) {
                    ($err = $stat->{last_error} ) =~ s/\s+/ /g;
                    if ( $opts{L} ) {
                        $err = substr($err, 0, $opts{L});
                    }
                }
                printf("%s %s %11d %d %s\n",
                    ts(time),
                    $stat->{relay_log_file},
                    $stat->{relay_log_pos},
                    $stat->{last_errno} || 0,
                    $err
                );
            }

            if ( $opts{e} && !exists($opts{e}->{$stat->{last_errno}}) ) {
                die "Error $stat->{last_errno} is not in --error-numbers.\n";
            }
            elsif ( $opts{E}
                && $stat->{last_error}
                && $stat->{last_error} !~ m/$opts{E}/
            ) {
                die "Error does not match --error-text.\n";
            }
            else {
                $set_skip->execute();
                $start->execute();
                $was_running = 0;

                # Only set this on events I tried to restart.  Otherwise there
                # could be a race condition: I see it, I record it, but it hasn't
                # caused an error yet; so I won't try to restart it when it does.
                # (The point of this is to avoid trying to restart the same event
                # twice in case another race condition happens -- I restart it,
                # then check the server and it hasn't yet cleared the error
                # message and restarted the SQL thread).
                $last_log = $stat->{relay_log_file};
                $last_pos = $stat->{relay_log_pos};
            }
        }
    }

    # Adjust sleep time.
    if ( $was_running ) {
        $sleep = min($opts{M}, $sleep * 2);
    }
    else {
        $sleep = max($opts{m}, $sleep / 2);
    }

    # Errors are very likely to follow each other in quick succession.  NOTE:
    # this policy has a side effect with respect to $sleep.  Suppose $sleep is
    # 512 and MySQL Slave Restart finds an error; now $sleep is 256, but MySQL
    # Slave Restart only sleeps 1 (the initial value of --sleep).  Suppose there
    # is no error when it wakes up after 1 second, because 1 was too short.  Now
    # it doubles $sleep, back to 512.  $sleep has the same value it did before
    # the error was ever found.
    print "sleeping $sleep\n" if $opts{v} > 2;
    sleep($was_running ? $sleep : min($sleep, $opts{s}));
}

# ############################################################################
# Subroutines.
# ############################################################################

# Determines if the $stat's log coordinates are greater than or equal to the
# desired coordinates.  $which is 'master' or 'relay'.  Both coordinates are
# rendered as "file/zero-padded-position" and compared as strings; the desired
# coordinates come from $opts{untilmaster} or $opts{untilrelay}.
sub pos_ge {
    my ( $stat, $which ) = @_;
    my $fmt  = '%s/%020d';
    my $curr = $which eq 'master'
        ? sprintf($fmt, @{$stat}{qw(relay_master_log_file exec_master_log_pos)})
        : sprintf($fmt, @{$stat}{qw(relay_log_file relay_log_pos)});
    my $stop = sprintf($fmt, split(',', $opts{"until$which"}));
    return $curr ge $stop;
}

# Compares versions like 5.0.27 and 4.1.15-standard-log.  Returns true when
# the server version reported by $dbh is at least $target.
sub version_ge {
    my ( $dbh, $target ) = @_;
    return _version_key($dbh->{mysql_serverinfo}) ge _version_key($target);
}

# Builds a fixed-width, string-comparable key from the first three numeric
# components of a version string.  Extra digit groups (as in
# "5.0.51a-3ubuntu5") are ignored and missing ones count as 0, so sprintf
# always receives exactly three arguments; a mismatch would raise a warning,
# which this program makes fatal.
sub _version_key {
    my ( $version ) = @_;
    my @parts = defined $version ? $version =~ m/(\d+)/g : ();
    push @parts, 0 while @parts < 3;
    return sprintf('%03d%03d%03d', @parts[0 .. 2]);
}

# Formats an epoch time as a local-time timestamp, YYYY-MM-DDTHH:MM:SS.
sub ts {
    my ( $time ) = @_;
    my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime($time);
    $mon  += 1;
    $year += 1900;
    return sprintf("%d-%02d-%02dT%02d:%02d:%02d",
        $year, $mon, $mday, $hour, $min, $sec);
}

# ############################################################################
# Documentation.
# ############################################################################

=pod

=head1 NAME

mysql-slave-restart - Watch and restart MySQL replication after errors.

=head1 SYNOPSIS

 mysql-slave-restart --verbose

=head1 DESCRIPTION

MySQL Slave Restart watches a MySQL replication slave and tries to skip
statements that cause errors.  It polls the slave intelligently with an
exponentially varying sleep time.  You can specify errors to skip and run the
slave until a certain binlog position.

Note: it has come to my attention that Yahoo! had or has an internal tool
called fix_repl, described to me by a past Yahoo! employee and mentioned in the
first edition of High Performance MySQL.  Apparently this tool does the same
thing.

Make no mistake, though: this is not a way to "fix replication."  In fact I
would not even encourage its use on a regular basis; I only use it when I have
an error I know I just need to skip past.  Indiscriminate use of this tool can
easily screw up a server you might have had a chance to truly fix.  You have
been warned.

=head1 OPTIONS

=over

=item --askpass

Prompt for a password for the connection.

=item --daemonize

Fork to the background and detach from the shell.
This probably doesn't work on Microsoft Windows.

=item --database

Database to use.

=item --defaults-file

Only read default options from the given file.

=item --error-numbers

Makes MySQL Slave Restart only try to restart if the error number is in this
comma-separated list of errors.  If it sees an error not in the list, it will
exit.

The error number is in the last_errno column of SHOW SLAVE STATUS.

=item --error-text

A Perl regular expression against which the error text, if any, is matched.  If
the error text exists and matches, MySQL Slave Restart will try to restart the
slave.  If it exists but doesn't match, MySQL Slave Restart will exit.

The error text is in the last_error column of SHOW SLAVE STATUS.

=item --error-length

When L<"--verbose"> is set high enough to print the error, this option will
truncate the error text to the specified length.  This can be useful to prevent
wrapping on the terminal.

=item --help

Show a brief help message and exit.

=item --host

Connect to host.

=item --maxsleep

The maximum time MySQL Slave Restart will sleep before polling the slave again.
See L<"SLEEP">.

=item --minsleep

The minimum time MySQL Slave Restart will sleep before polling the slave again.
See L<"SLEEP">.

=item --password

Password to use when connecting.

=item --port

Port number to use for connection.

=item --skipcount

The number of statements to skip when restarting the slave.

=item --sleep

The initial sleep time between checking the slave.  See L<"SLEEP">.

=item --socket

Socket file to use for connection.

=item --time

Causes MySQL Slave Restart to stop after the specified time has elapsed.  The
argument can have a suffix of s, m, h, or d, indicating seconds, minutes,
hours, or days.  The number is interpreted as seconds if there is no suffix.

=item --untilmaster

Start the slave, and retry if it fails, until it reaches the given replication
coordinates.  The coordinates are the logfile and position on the master, given
by relay_master_log_file, exec_master_log_pos.
The argument must be in the format "file,pos".  Separate the filename and
position with a single comma and no space.

This will also cause an UNTIL clause to be given to START SLAVE.

After reaching this point, the slave should be stopped and MySQL Slave Restart
will exit.

=item --untilrelay

Like L<"--untilmaster">, but in the slave's relay logs instead.  The
coordinates are given by relay_log_file, relay_log_pos.

=item --user

User for login if not current user.

=item --verbose

Verbosity; specify multiple times for more verbosity.  Default is no output.
Verbosity 1 outputs a timestamp, relay_log_file, relay_log_pos, and last_errno.
Verbosity 2 adds last_error.  See also L<"--error-length">.  Verbosity 3 prints
the current sleep time each time MySQL Slave Restart sleeps.

=item --version

Output version information and exit.

=back

=head1 SYSTEM REQUIREMENTS

You need Perl, DBI, DBD::mysql, and some core packages that ought to be
installed in any reasonably new version of Perl.

=head1 OUTPUT

If you specify --verbose, MySQL Slave Restart prints a line every time it sees
the slave has an error.  See L<"--verbose"> for details.

=head1 SLEEP

MySQL Slave Restart sleeps intelligently between polling the slave.  The
current sleep time varies.

=over

=item *

The initial sleep time is given by L<"--sleep">.

=item *

If it checks and finds an error, it halves the previous sleep time.

=item *

If it finds no error, it doubles the previous sleep time.

=item *

The sleep time is bounded below by L<"--minsleep"> and above by
L<"--maxsleep">.

=item *

Immediately after finding an error, MySQL Slave Restart assumes another error
is very likely to happen next, so it sleeps the current sleep time or the
initial sleep time, whichever is less.

=back

=head1 COMPATIBILITY

MySQL Slave Restart should work on many versions of MySQL.  Lettercase of many
output columns from SHOW SLAVE STATUS has changed over time, so it treats them
all as lowercase.

=head1 BUGS

If you find bugs, need features, etc please use the bug tracker, forums, and
mailing lists at http://sourceforge.net/projects/mysqltoolkit.

=head1 SEE ALSO

See also the other tools in the MySQL Toolkit distribution at
http://sourceforge.net/projects/mysqltoolkit.

=head1 COPYRIGHT, LICENSE AND WARRANTY

This program is copyright (c) 2007 Baron Schwartz.  Feedback and improvements
are welcome.

THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
systems, you can issue `man perlgpl' or `man perlartistic' to read these
licenses.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA  02111-1307  USA.

=head1 AUTHOR

Baron Schwartz

=head1 VERSION

This manual page documents Ver 1.0.1 Distrib 1053 $Revision: 863 $.

=cut