diff --git a/NEWS.md b/NEWS.md index 6a218bb6b..17d662220 100644 --- a/NEWS.md +++ b/NEWS.md @@ -137,6 +137,9 @@ 21. `melt()` on a `data.table` with `list` columns for `measure.vars` would silently ignore `na.rm=TRUE`, [#5044](https://github.com/Rdatatable/data.table/issues/5044). Now the same logic as `is.na()` from base R is used; i.e. if list element is scalar NA then it is considered missing and removed. Thanks to Toby Dylan Hocking for the PRs. +22. `fread(fill=TRUE)` could segfault if the input contained an improperly quoted character field, [#4774](https://github.com/Rdatatable/data.table/issues/4774) [#5041](https://github.com/Rdatatable/data.table/issues/5041). Thanks to @AndeolEvain and @e-nascimento for reporting and to Václav Tlapák for the PR. + +23. `fread(fill=TRUE, verbose=TRUE)` would segfault on the out-of-sample type bump verbose output if the input did not contain column names, [5046](https://github.com/Rdatatable/data.table/pull/5046). Thanks to Václav Tlapák for the PR. ## NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c1a6b1ca3..21972d44b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17763,3 +17763,15 @@ test(2195.5, duplicated(DT, by=character(0L)), ans<-c(FALSE, FALSE, TRUE, FALSE) test(2195.6, duplicated(DT, by=NULL), ans) test(2195.7, anyDuplicated(DT, by=character(0L)), 3L) test(2195.8, anyDuplicated(DT, by=NULL), 3L) + +# Improperly quoted character field with fill=TRUE would segfault, #4774 and #5041 +test(2196, + fread(paste0(paste(rep(c('a; b'), 100), collapse='\n'), c('\n"a" 2;b')), fill=TRUE, quote='\"'), + data.table(a=c(rep('a', 99), '"a" 2'), b=rep('b', 100)), warning='Found and resolved improper quoting') + +# Verbose output would segfault when no header present, out-of-sample type error and fill=TRUE, similar to #4644 +# Test vaildity may depend on sampling behaviour of fread since the type bump needs to occur out of sample to trigger the segfault +sampleText = paste0(paste(rep(c('1; 2'), 100), collapse='\n'), c('\n"a";2\n1; 2\n'), paste(rep(c('1; 2'), 100), collapse='\n')) +test(2197, fread(sampleText, fill=TRUE, quote='\"', verbose=TRUE, header=FALSE), + data.table(rep(c("1","a","1"),c(100,1,101)), 2L), + output='Column 1 bumped') diff --git a/src/fread.c b/src/fread.c index da3e0e18e..d00cffcd1 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2361,6 +2361,9 @@ int freadMain(freadMainArgs _args) { if (j+fieldsRemaining != ncol) break; checkedNumberOfFields = true; } + if (thisType <= -NUMTYPE) { + break; // Improperly quoted char field needs to be healed below, other columns will be filled #5041 and #4774 + } #pragma omp critical { joldType = type[j]; // fetch shared value again in case another thread bumped it while I was waiting. @@ -2369,8 +2372,8 @@ int freadMain(freadMainArgs _args) { if (verbose) { char temp[1001]; int len = snprintf(temp, 1000, - _("Column %d (\"%.*s\") bumped from '%s' to '%s' due to <<%.*s>> on row %"PRIu64"\n"), - j+1, colNames[j].len, colNamesAnchor + colNames[j].off, + _("Column %d%s%.*s%s bumped from '%s' to '%s' due to <<%.*s>> on row %"PRIu64"\n"), + j+1, colNames?" <<":"", colNames?(colNames[j].len):0, colNames?(colNamesAnchor+colNames[j].off):"", colNames?">>":"", typeName[abs(joldType)], typeName[abs(thisType)], (int)(tch-fieldStart), fieldStart, (uint64_t)(ctx.DTi+myNrow)); if (len > 1000) len = 1000;