Skip to content

Commit

Permalink
additional failure logic:
Browse files Browse the repository at this point in the history
- logger.fatal() also sets crawl status to 'failed'
- add 'failOnFailedLimit' to set crawl status to 'failed' if number of failed pages exceeds limit, refactored from #393
  • Loading branch information
ikreymer committed Oct 3, 2023
1 parent a23f840 commit 235f963
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 15 deletions.
35 changes: 21 additions & 14 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -329,15 +329,9 @@ export class Crawler {
}

} finally {
logger.info(`Final crawl status: ${status}`);
logger.info(`Crawl status: ${status}`);

if (this.crawlState) {
await this.crawlState.setStatus(status);
}

await this.closeLog();

await this.setEndTimeAndExit(exitCode);
await this.setEndTimeAndExit(exitCode, status);
}
}

Expand Down Expand Up @@ -698,6 +692,13 @@ self.__bx_behaviors.selectMainBehavior();
}
}

// Fail the crawl once the number of failed pages reaches the configured limit.
// A falsy limit (e.g. 0) skips the check entirely.
if (this.params.failOnFailedLimit) {
  // NOTE(review): crawlState.numDone() is awaited elsewhere in this file; if numFailed()
  // is likewise async this call needs an `await` (and an async enclosing method) — confirm.
  const numFailed = this.crawlState.numFailed();
  if (numFailed >= this.params.failOnFailedLimit) {
    // Report the same option that was compared against (was `failedLimit`, which is not the option checked here).
    logger.fatal(`Failed threshold reached ${numFailed} >= ${this.params.failOnFailedLimit}, failing crawl`);
  }
}

if (interrupt) {
this.uploadAndDeleteLocal = true;
this.gracefulFinishOnInterrupt();
Expand All @@ -712,18 +713,26 @@ self.__bx_behaviors.selectMainBehavior();
}
}

async setEndTimeAndExit(exitCode = 0) {
async setEndTimeAndExit(exitCode, status) {
await this.closeLog();

if (this.crawlState) {
if (status) {
await this.crawlState.setStatus(status);
}
await this.crawlState.setEndTime();
}
process.exit(exitCode);
}

async serializeAndExit() {
await this.serializeConfig();
await this.closeLog();

await this.setEndTimeAndExit(this.interrupted ? 13 : 0);
if (this.interrupted) {
await this.setEndTimeAndExit(13, "interrupted");
} else {
await this.setEndTimeAndExit(0, "done");
}
}

async isCrawlRunning() {
Expand Down Expand Up @@ -948,10 +957,8 @@ self.__bx_behaviors.selectMainBehavior();
if ((await this.crawlState.numDone()) > 0) {
return;
}
// stopped and no done pages, mark crawl as failed
await this.crawlState.setStatus("failed");
}
// fail for now, may restart to try again
// fail crawl otherwise
logger.fatal("No WARC Files, assuming crawl failed");
}

Expand Down
6 changes: 6 additions & 0 deletions util/argParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,12 @@ class ArgParser {
default: false
},

// Limit on failed pages; the crawler only applies the check when this value is truthy,
// so the default of 0 leaves it disabled.
// NOTE(review): the crawler compares with `>=`, i.e. it fails when the count *reaches*
// this value, while the description below says "exceeds" — reconcile the wording or the check.
"failOnFailedLimit": {
describe: "If set, save state and exit if number of failed pages exceeds this value",
type: "number",
default: 0,
},

"customBehaviors": {
describe: "injects a custom behavior file or set of behavior files in a directory",
type: ["string"]
Expand Down
7 changes: 6 additions & 1 deletion util/logger.js
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,13 @@ class Logger
fatal(message, data={}, context="general", exitCode=17) {
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");

// Mark the crawl as failed and record its end time; awaited before the process exits.
async function markFailedAndEnd(crawlState) {
  await crawlState.setStatus("failed");
  await crawlState.setEndTime();
}

if (this.crawlState) {
  // finally() needs a callback: writing `.finally(process.exit(exitCode))` evaluates
  // process.exit() right away, terminating before the status and end time are written.
  // Using finally (not then) still exits with exitCode if either update rejects.
  markFailedAndEnd(this.crawlState).finally(() => process.exit(exitCode));
} else {
  process.exit(exitCode);
}
Expand Down

0 comments on commit 235f963

Please sign in to comment.