Skip to content

Commit

Permalink
Merge pull request #626 from internetarchive/checkpoint-on-shutdown
Browse files Browse the repository at this point in the history
Add checkpoint on shutdown and --checkpoint CLI option
  • Loading branch information
ato authored Nov 28, 2024
2 parents 8ec6142 + ec689db commit 4093871
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 7 deletions.
18 changes: 16 additions & 2 deletions docs/operating.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ Command-line Options
-b, --web-bind-hosts HOST
Specifies a comma-separated list of hostnames/IP-addresses to bind to the Web UI. You may use '/' as a
shorthand for 'all addresses'. **Default**: ``localhost/127.0.0.1``
-c, --checkpoint ARG
Recovers from the given checkpoint. May only be used with the --run-job option. The special value 'latest'
will recover the last checkpoint or, if none exist, will launch a new crawl.
-j, --jobs-dir PATH
Sets the directory Heritrix stores jobs in. **Default:** ``$HERITRIX_HOME/jobs``
-l, --logging-properties PATH
Expand Down Expand Up @@ -685,11 +688,18 @@ To configure Heritrix to automatically run checkpoints periodically, set the
<bean id="checkpointService" class="org.archive.crawler.framework.CheckpointService">
<property name="checkpointIntervalMinutes" value="60"/>
<property name="checkpointOnShutdown" value="true"/>
<!-- <property name="checkpointsDir" value="checkpoints"/> -->
<!-- <property name="forgetAllButLatest" value="true"/> -->
<property name="forgetAllButLatest" value="true"/>
</bean>
By default only the latest checkpoint will be kept.
When ``checkpointOnShutdown`` is enabled Heritrix will create a checkpoint if the job is running when the JVM is
shut down gracefully. Note that if Heritrix is killed, crashes, or the server it is running on unexpectedly loses
power, the shutdown checkpoint will not be created. Consequently, it may be ideal to enable both shutdown and interval
checkpoints together.

Setting ``forgetAllButLatest`` to ``true`` will ensure only the latest checkpoint is kept.


Restarting from a Checkpoint
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand All @@ -708,6 +718,10 @@ The web UI provides an option to restart a crawl from a checkpoint:

The job will now begin running from the chosen checkpoint.

When running a job from the command-line with the ``--run-job`` CLI option, you can use the ``--checkpoint`` option to
restart the job from a named checkpoint. The special name ``latest`` will restart from the latest checkpoint if any
exist; otherwise it will launch a new crawl.

Crawl Recovery
--------------

Expand Down
22 changes: 20 additions & 2 deletions engine/src/main/java/org/archive/crawler/Heritrix.java
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ private static Options options() {
"\"password\" (which leaves username as the default 'admin'), " +
"\"username:password\", or \"@filename\" for a file that " +
"includes the single line \"username:password\". ");
options.addOption("c", "checkpoint", true,
"Recovers from the given checkpoint. May only be used with the " +
"--run-job option. The special value 'latest' will recover the " +
"last checkpoint or if none exist will launch a new crawl.");
options.addOption("j", "jobs-dir", true, "The jobs directory. " +
"Defaults to ./jobs");
options.addOption("l", "logging-properties", true,
Expand Down Expand Up @@ -265,6 +269,11 @@ public void instanceMain(String[] args)
System.exit(1);
authPassword = ""; // suppresses uninitialized warning
}

if (cl.hasOption('c') && !cl.hasOption('r')) {
System.err.println("Cannot use --checkpoint without --run-job.");
System.exit(1);
}

File jobsDir = null;
if (cl.hasOption('j')) {
Expand Down Expand Up @@ -374,12 +383,21 @@ public void instanceMain(String[] args)
}
if (cl.hasOption('r')) {
String jobName = cl.getOptionValue('r');
engine.requestLaunch(jobName);
CrawlJob job = engine.getJob(jobName);
if (job == null || job.getCrawlController() == null) {
if (job == null) {
System.err.println("Job not found: " + jobName);
System.exit(1);
}
job.validateConfiguration();
if (cl.hasOption('c')) {
job.getCheckpointService().setRecoveryCheckpointByName(cl.getOptionValue('c'));
}
job.launch();
if (job.getCrawlController() == null) {
System.err.println("Failed to launch job: " + jobName);
System.exit(1);
}

job.getCrawlController().requestCrawlResume();
engine.waitForNoRunningJobs(0);
engine.shutdown();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
* build and before launch (setRecoveryCheckpointByName).
*
* Offers optional automatic checkpointing at a configurable interval
* in minutes.
* in minutes.
*
* @author stack
* @author gojomo
Expand All @@ -80,6 +80,8 @@ public class CheckpointService implements Lifecycle, ApplicationContextAware, Ha
protected TimerTask checkpointTask = null;
protected ConfigPath checkpointsDir =
new ConfigPath("checkpoints subdirectory","checkpoints");
protected Thread shutdownHook;

/**
 * Returns the {@code checkpointsDir} configuration path held by this
 * service.
 *
 * @return the current {@code checkpointsDir} value
 */
public ConfigPath getCheckpointsDir() {
    return this.checkpointsDir;
}
Expand All @@ -106,6 +108,21 @@ public void setCheckpointIntervalMinutes(long interval) {
setupCheckpointTask();
}
}

protected boolean checkpointOnShutdown = false;

/**
 * Returns whether checkpoint-on-shutdown is currently enabled.
 *
 * @return the current {@code checkpointOnShutdown} flag
 */
public boolean getCheckpointOnShutdown() {
    return this.checkpointOnShutdown;
}

/**
 * Sets whether a checkpoint should be made when the JVM shuts down.
 * The default is {@code false}. After storing the new value this invokes
 * {@link #setupShutdownHook()}.
 *
 * @param checkpointOnShutdown {@code true} to checkpoint on JVM shutdown
 */
public void setCheckpointOnShutdown(boolean checkpointOnShutdown) {
    this.checkpointOnShutdown = checkpointOnShutdown;
    this.setupShutdownHook();
}

protected boolean forgetAllButLatest = false;
public boolean getForgetAllButLatest() {
Expand Down Expand Up @@ -179,6 +196,7 @@ public synchronized void start() {
}
this.isRunning = true;
setupCheckpointTask();
setupShutdownHook();
}

/**
Expand Down Expand Up @@ -221,12 +239,31 @@ public void run() {
/**
 * Returns the current value of the {@code isRunning} flag.
 *
 * @return {@code true} if the service is marked as running
 */
public synchronized boolean isRunning() {
    return this.isRunning;
}


/**
 * Registers a JVM shutdown hook that pauses the crawl and then requests a
 * checkpoint.
 *
 * Does nothing unless {@code checkpointOnShutdown} is enabled, the service
 * is running, and no hook has been registered yet.
 */
protected synchronized void setupShutdownHook() {
    if (!checkpointOnShutdown || shutdownHook != null || !isRunning) {
        return;
    }
    shutdownHook = new Thread(() -> {
        // the setting may have been switched off after the hook was registered
        if (!checkpointOnShutdown) {
            return;
        }
        LOGGER.info("Checkpointing on shutdown");
        try {
            // pause first to ensure no crawling occurs between the checkpoint and process termination
            controller.requestCrawlPause();
            requestCrawlCheckpoint();
        } catch (Exception e) {
            // hand the exception to the logger so the stack trace is recorded,
            // not only its (possibly null) message
            LOGGER.log(java.util.logging.Level.SEVERE,
                    "Failed to checkpoint on shutdown: " + e.getMessage(), e);
        }
    });
    Runtime.getRuntime().addShutdownHook(shutdownHook);
}

/**
 * Stops the service: cancels the checkpoint timer, marks the service as
 * not running, and unregisters the shutdown hook if one was registered.
 */
public synchronized void stop() {
    LOGGER.info("Cleaned up Checkpoint TimerThread.");
    this.timer.cancel();
    this.isRunning = false;
    if (shutdownHook != null) {
        try {
            Runtime.getRuntime().removeShutdownHook(shutdownHook);
        } catch (IllegalStateException ignored) {
            // The JVM is already shutting down, so hooks can no longer be
            // removed. Nothing to undo; just drop our reference below.
        }
        shutdownHook = null;
    }
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,7 @@ http://example.example/example
<bean id="checkpointService"
class="org.archive.crawler.framework.CheckpointService">
<!-- <property name="checkpointIntervalMinutes" value="-1"/> -->
<!-- <property name="checkpointOnShutdown" value="true"/> -->
<!-- <property name="checkpointsDir" value="checkpoints"/> -->
<!-- <property name="forgetAllButLatest" value="true"/> -->
</bean>
Expand Down

0 comments on commit 4093871

Please sign in to comment.