This patch fixes bug #947 where a large number of emails were generated because the heartbeat interval was small and the tape in the drive did not correspond to the one wanted by Bacula. Apply the patch to version 2.2.5 (and probably any 2.2.x version) with: cd ./configure patch -p0 <2.2.5-hb.patch make ... make install Index: src/stored/wait.c =================================================================== --- src/stored/wait.c (revision 5814) +++ src/stored/wait.c (working copy) @@ -40,9 +40,8 @@ #include "bacula.h" /* pull in global headers */ #include "stored.h" /* pull in Storage Deamon headers */ -//static bool double_jcr_wait_time(JCR *jcr); +const int dbglvl = 400; - /* * Wait for SysOp to mount a tape on a specific device * @@ -62,7 +61,7 @@ JCR *jcr = dcr->jcr; dev->dlock(); - Dmsg1(100, "Enter blocked=%s\n", dev->print_blocked()); + Dmsg1(dbglvl, "Enter blocked=%s\n", dev->print_blocked()); unmounted = is_device_unmounted(dev); dev->poll = false; @@ -84,27 +83,28 @@ } if (!unmounted) { - Dmsg1(400, "blocked=%s\n", dev->print_blocked()); + Dmsg1(dbglvl, "blocked=%s\n", dev->print_blocked()); dev->dev_prev_blocked = dev->blocked(); dev->set_blocked(BST_WAITING_FOR_SYSOP); /* indicate waiting for mount */ } for ( ; !job_canceled(jcr); ) { - time_t now, start; + time_t now, start, total_waited; gettimeofday(&tv, &tz); timeout.tv_nsec = tv.tv_usec * 1000; timeout.tv_sec = tv.tv_sec + add_wait; - Dmsg4(400, "I'm going to sleep on device %s. HB=%d wait=%d add_wait=%d\n", - dev->print_name(), (int)me->heartbeat_interval, dev->wait_sec, add_wait); + Dmsg4(dbglvl, "I'm going to sleep on device %s. HB=%d rem_wait=%d add_wait=%d\n", + dev->print_name(), (int)me->heartbeat_interval, dev->rem_wait_sec, add_wait); start = time(NULL); /* Wait required time */ stat = pthread_cond_timedwait(&dev->wait_next_vol, &dev->m_mutex, &timeout); - Dmsg2(400, "Wokeup from sleep on device stat=%d blocked=%s\n", stat, + Dmsg2(dbglvl, "Wokeup from sleep on device stat=%d blocked=%s\n", stat, dev->print_blocked()); now = time(NULL); + total_waited = now - first_start; dev->rem_wait_sec -= (now - start); /* Note, this always triggers the first time. We want that. */ @@ -113,7 +113,7 @@ /* send heartbeats */ if (jcr->file_bsock) { jcr->file_bsock->signal(BNET_HEARTBEAT); - Dmsg0(400, "Send heartbeat to FD.\n"); + Dmsg0(dbglvl, "Send heartbeat to FD.\n"); } if (jcr->dir_bsock) { jcr->dir_bsock->signal(BNET_HEARTBEAT); @@ -131,7 +131,7 @@ if (dev->rem_wait_sec <= 0) { /* on exceeding wait time return */ - Dmsg0(400, "Exceed wait time.\n"); + Dmsg0(dbglvl, "Exceed wait time.\n"); stat = W_TIMEOUT; break; } @@ -142,8 +142,8 @@ unmounted = is_device_unmounted(dev); if (!unmounted && dev->vol_poll_interval && - (now - first_start >= dev->vol_poll_interval)) { - Dmsg1(400, "In wait blocked=%s\n", dev->print_blocked()); + (total_waited >= dev->vol_poll_interval)) { + Dmsg1(dbglvl, "poll return in wait blocked=%s\n", dev->print_blocked()); dev->poll = true; /* returning a poll event */ stat = W_POLL; break; @@ -152,6 +152,7 @@ * Check if user mounted the device while we were waiting */ if (dev->blocked() == BST_MOUNT) { /* mount request ? */ + Dmsg0(dbglvl, "Mounted return.\n"); stat = W_MOUNT; break; } @@ -160,30 +161,39 @@ * If we did not timeout, then some event happened, so * return to check if state changed. */ - if (stat != 0) { + if (stat != ETIMEDOUT) { + berrno be; + Dmsg2(dbglvl, "Wake return. stat=%d. ERR=%s\n", stat, be.bstrerror(stat)); stat = W_WAKE; /* someone woke us */ break; } /* * At this point, we know we woke up because of a timeout, - * that was due to a heartbeat, so we just update - * the wait counters and continue. + * that was due to a heartbeat, because any other reason would + * have caused us to return, so update the wait counters and continue. */ - add_wait = dev->wait_sec - (now - start); + add_wait = dev->rem_wait_sec; + if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { + add_wait = me->heartbeat_interval; + } + /* If the user did not unmount the tape and we are polling, ensure + * that we poll at the correct interval. + */ + if (!unmounted && dev->vol_poll_interval && + add_wait > dev->vol_poll_interval - total_waited) { + add_wait = dev->vol_poll_interval - total_waited; + } if (add_wait < 0) { add_wait = 0; } - if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { - add_wait = me->heartbeat_interval; - } } if (!unmounted) { dev->set_blocked(dev->dev_prev_blocked); /* restore entry state */ - Dmsg1(400, "set %s\n", dev->print_blocked()); + Dmsg1(dbglvl, "set %s\n", dev->print_blocked()); } - Dmsg1(400, "Exit blocked=%s\n", dev->print_blocked()); + Dmsg1(dbglvl, "Exit blocked=%s\n", dev->print_blocked()); dev->dunlock(); return stat; } @@ -209,7 +219,7 @@ const int max_wait_time = 1 * 60; /* wait 1 minute */ char ed1[50]; - Dmsg0(100, "Enter wait_for_device\n"); + Dmsg0(dbglvl, "Enter wait_for_device\n"); P(device_release_mutex); if (++retries % 5 == 0) { @@ -222,14 +232,14 @@ timeout.tv_nsec = tv.tv_usec * 1000; timeout.tv_sec = tv.tv_sec + max_wait_time; - Dmsg1(100, "JobId=%u going to wait for a device.\n", (uint32_t)jcr->JobId); + Dmsg0(dbglvl, "Going to wait for a device.\n"); /* Wait required time */ stat = pthread_cond_timedwait(&wait_device_release, &device_release_mutex, &timeout); - Dmsg2(100, "JobId=%u wokeup from sleep on device stat=%d\n", (uint32_t)jcr->JobId, stat); + Dmsg1(dbglvl, "Wokeup from sleep on device stat=%d\n", stat); V(device_release_mutex); - Dmsg2(100, "JobId=%u return from wait_device ok=%d\n", (uint32_t)jcr->JobId, ok); + Dmsg1(dbglvl, "Return from wait_device ok=%d\n", ok); return ok; }