From e78b4d0d17c82212bc95ffd862f25bcab1873b3c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sun, 7 Nov 2021 21:13:25 +0100 Subject: [PATCH 1/4] improve guess --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 7 +++++-- src/fread.c | 27 ++++++++++++++++++++------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5faf40723..8fa985f78 100644 --- a/NEWS.md +++ b/NEWS.md @@ -451,6 +451,8 @@ 47. `tables()` failed with `argument "..." is missing` when called from within a function taking `...`; e.g. `function(...) { tables() }`, [#5197](https://github.com/Rdatatable/data.table/issues/5197). Thanks @greg-minshall for the report and @michaelchirico for the fix. +48. `fread()` makes now better guesses whether the file contains a header or not, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6382a13a8..7b5a363de 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11786,7 +11786,7 @@ test(1777.16, fread("A,B,3\nC,D,4\n", verbose=TRUE), data.table(V1=c("A","C"),V2 test(1777.17, fread("A,B,C\nD,E,F\n", verbose=TRUE), data.table(A="D", B="E", C="F"), output="because all columns are type string and a better guess is not possible") test(1777.18, fread("A,B,C\nC,D,4\n", verbose=TRUE), data.table(A="C",B="D",C=4L), - output="due to column 3 containing a string on row 1 and a lower type.*in the rest of the.*sample rows") + output="contain a string \\(or miss\\) on row 1 and a lower type in the rest of the.*sample rows") # unquoted fields containing \r, #2371 test(1778.1, fread("A,B,C\n0,,\n1,hello\rworld,2\n3,test,4\n", verbose=TRUE), @@ -12484,9 +12484,12 @@ if (test_R.utils) { } # better colname detection by comparing potential column names to the whole sample not just the first row of the sample, #2526 -test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(A=NA, "100"=c(300L,500L), "200"=c(400L,600L))) +test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,300L,500L), V3=c(200L,400L,600L))) test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(A=NA, "100"=c(NA,500L), V3=c(NA,600L))) test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4))) +test(1870.4, fread("A,B,200\n,300,400\n,500,600"), data.table(A=NA, B=c(300L,500L), "200"=c(400L,600L))) +test(1870.5, fread("A,B,\n,,\n,500,600"), data.table(A=NA, B=c(NA,500L), V3=c(NA,600L))) +test(1870.6, fread("A,,\n,300,400\n,500,600"), data.table(A=NA, V2=c(300L,500L), V3=c(400L,600L))) # nrows= now ignores errors after those nrows as expected and skip= determines first row for sure, #1267 txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n" diff --git a/src/fread.c b/src/fread.c index be5097684..88809fb83 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1882,13 +1882,26 @@ int freadMain(freadMainArgs _args) { for (int j=0; j0) for (int j=0; j0) { + bool header_ltype = false; + int nHeaderLowerMiss = 0; + int nStringCol = 0; + for (int j=0; j 0 + if (nHeaderLowerMiss > ncol / 2) { + args.header=true; + if (verbose) DTPRINT(_(" 'header' determined to be true due to %d out of %d columns (%.2f%%) contain a string (or miss) on row 1 and a lower type in the rest of the %d sample rows\n"), + nHeaderLowerMiss, ncol, nHeaderLowerMiss * 100.0 / ncol, sampleLines); + } else if (nHeaderLowerMiss + nStringCol == ncol) { + args.header=true; + if (verbose) DTPRINT(_(" 'header' determined to be true due to %d columns contain a string (or miss) on row 1 and a lower type in the rest of the %d sample rows and other columns are of type string\n"), + nHeaderLowerMiss, sampleLines); + } } } } From dfac6b834eec575f89111aa31d33791fad98d8b7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Mon, 8 Nov 2021 00:11:28 +0100 Subject: [PATCH 2/4] clarify comment --- src/fread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fread.c b/src/fread.c index 88809fb83..ce84e3d58 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1892,7 +1892,7 @@ int freadMain(freadMainArgs _args) { else if(tmpType[j]==CT_BOOL8_U) nHeaderLowerMiss++; } else if(type[j]==CT_STRING) nStringCol++; } - if (header_ltype) { // previous handling of header detection had args.header=true when header_lower > 0 + if (header_ltype) { // previous handling of header detection already set args.header=true when header_ltype==true if (nHeaderLowerMiss > ncol / 2) { args.header=true; if (verbose) DTPRINT(_(" 'header' determined to be true due to %d out of %d columns (%.2f%%) contain a string (or miss) on row 1 and a lower type in the rest of the %d sample rows\n"), From 2171c97fac1c6bdb8b6a63faa0324af7a58dcaec Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 29 Nov 2021 22:57:21 -0700 Subject: [PATCH 3/4] simpler fix by adding CT_EMPTY --- NEWS.md | 2 +- inst/tests/tests.Rraw | 6 +++--- src/fread.c | 41 +++++++++++++++++------------------------ src/fread.h | 3 ++- src/freadR.c | 6 +++--- 5 files changed, 26 insertions(+), 32 deletions(-) diff --git a/NEWS.md b/NEWS.md index 22e5e6c7b..f5fe08f92 100644 --- a/NEWS.md +++ b/NEWS.md @@ -252,7 +252,7 @@ # 4: 4 NA ``` -32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the data sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. +32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. ## BUG FIXES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f7b468427..8982342bc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11800,7 +11800,7 @@ test(1777.16, fread("A,B,3\nC,D,4\n", verbose=TRUE), data.table(V1=c("A","C"),V2 test(1777.17, fread("A,B,C\nD,E,F\n", verbose=TRUE), data.table(A="D", B="E", C="F"), output="because all columns are type string and a better guess is not possible") test(1777.18, fread("A,B,C\nC,D,4\n", verbose=TRUE), data.table(A="C",B="D",C=4L), - output="contain a string \\(or miss\\) on row 1 and a lower type in the rest of the.*sample rows") + output="due to column 3 containing a string on row 1 and a lower type.*in the rest of the.*sample rows") # unquoted fields containing \r, #2371 test(1778.1, fread("A,B,C\n0,,\n1,hello\rworld,2\n3,test,4\n", verbose=TRUE), @@ -12499,11 +12499,11 @@ if (test_R.utils) { # better colname detection by comparing potential column names to the whole sample not just the first row of the sample, #2526 test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,300L,500L), V3=c(200L,400L,600L))) -test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(A=NA, "100"=c(NA,500L), V3=c(NA,600L))) +test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,NA,500L), V3=c(NA,NA,600L))) test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4))) test(1870.4, fread("A,B,200\n,300,400\n,500,600"), data.table(A=NA, B=c(300L,500L), "200"=c(400L,600L))) test(1870.5, fread("A,B,\n,,\n,500,600"), data.table(A=NA, B=c(NA,500L), V3=c(NA,600L))) -test(1870.6, fread("A,,\n,300,400\n,500,600"), data.table(A=NA, V2=c(300L,500L), V3=c(400L,600L))) +test(1870.6, fread("A,,\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(NA,300L,500L), V3=c(NA,400L,600L))) # nrows= now ignores errors after those nrows as expected and skip= determines first row for sure, #1267 txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n" diff --git a/src/fread.c b/src/fread.c index 24b316aff..04df88d9c 100644 --- a/src/fread.c +++ b/src/fread.c @@ -66,8 +66,8 @@ static int8_t *type = NULL, *tmpType = NULL, *size = NULL; static lenOff *colNames = NULL; static freadMainArgs args = {0}; // global for use by DTPRINT; static implies ={0} but include the ={0} anyway just in case for valgrind #4639 -const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; -int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; +const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; +int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; // In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043 // So we assign them through below init function. @@ -1076,6 +1076,12 @@ static void parse_iso8601_timestamp(FieldParseContext *ctx) *target = NA_FLOAT64; } +static void parse_empty(FieldParseContext *ctx) +{ + int8_t *target = (int8_t*) ctx->targets[sizeof(int8_t)]; + *target = NA_BOOL8; +} + /* Parse numbers 0 | 1 as boolean and ,, as NA (fwrite's default) */ static void parse_bool_numeric(FieldParseContext *ctx) { @@ -1152,7 +1158,8 @@ static void parse_bool_lowercase(FieldParseContext *ctx) */ typedef void (*reader_fun_t)(FieldParseContext *ctx); static reader_fun_t fun[NUMTYPE] = { - (reader_fun_t) &Field, + (reader_fun_t) &Field, // CT_DROP + (reader_fun_t) &parse_empty, // CT_EMPTY (reader_fun_t) &parse_bool_numeric, (reader_fun_t) &parse_bool_uppercase, (reader_fun_t) &parse_bool_titlecase, @@ -1167,7 +1174,7 @@ static reader_fun_t fun[NUMTYPE] = { (reader_fun_t) &Field }; -static int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +static int disabled_parsers[NUMTYPE] = {0}; static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped) { // used in sampling column types and whether column names are present @@ -1882,26 +1889,12 @@ int freadMain(freadMainArgs _args) { for (int j=0; j0) { - bool header_ltype = false; - int nHeaderLowerMiss = 0; - int nStringCol = 0; - for (int j=0; j ncol / 2) { - args.header=true; - if (verbose) DTPRINT(_(" 'header' determined to be true due to %d out of %d columns (%.2f%%) contain a string (or miss) on row 1 and a lower type in the rest of the %d sample rows\n"), - nHeaderLowerMiss, ncol, nHeaderLowerMiss * 100.0 / ncol, sampleLines); - } else if (nHeaderLowerMiss + nStringCol == ncol) { - args.header=true; - if (verbose) DTPRINT(_(" 'header' determined to be true due to %d columns contain a string (or miss) on row 1 and a lower type in the rest of the %d sample rows and other columns are of type string\n"), - nHeaderLowerMiss, sampleLines); - } + if (sampleLines>0) for (int j=0; jCT_EMPTY) { + args.header=true; + if (verbose) DTPRINT(_(" 'header' determined to be true due to column %d containing a string on row 1 and a lower type (%s) in the rest of the %d sample rows\n"), + j+1, typeName[type[j]], sampleLines); + break; } } } diff --git a/src/fread.h b/src/fread.h index 446da18e4..7035615a5 100644 --- a/src/fread.h +++ b/src/fread.h @@ -20,7 +20,8 @@ typedef enum { NEG = -1, // dummy to force signed type; sign bit used for out-of-sample type bump management CT_DROP = 0, // skip column requested by user; it is navigated as a string column with the prevailing quoteRule - CT_BOOL8_N, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. + CT_EMPTY, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. EMPTY to help column heading guess, #5257 + CT_BOOL8_N, // int8_t CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, diff --git a/src/freadR.c b/src/freadR.c index 97fe691aa..786bb6e94 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -24,9 +24,9 @@ Secondary separator for list() columns, such as columns 11 and 12 in BED (no nee #define NUT NUMTYPE+2 // +1 for "numeric" alias for "double"; +1 for CLASS fallback using as.class() at R level afterwards -static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; -static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; -static int typeEnum[NUT] = {CT_DROP, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; +static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; +static char typeRName[NUT][10]={"NULL", "empty", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; +static int typeEnum[NUT] = {CT_DROP, CT_EMPTY, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; static colType readInt64As=CT_INT64; static SEXP selectSxp; static SEXP dropSxp; From 3dacb6d0f5472311fabdbeed830f5271fe9b8474 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 29 Nov 2021 23:05:02 -0700 Subject: [PATCH 4/4] typeRname empty->logical --- src/freadR.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/freadR.c b/src/freadR.c index 786bb6e94..82992aba3 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -24,9 +24,9 @@ Secondary separator for list() columns, such as columns 11 and 12 in BED (no nee #define NUT NUMTYPE+2 // +1 for "numeric" alias for "double"; +1 for CLASS fallback using as.class() at R level afterwards -static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; -static char typeRName[NUT][10]={"NULL", "empty", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; -static int typeEnum[NUT] = {CT_DROP, CT_EMPTY, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; +static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; +static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; +static int typeEnum[NUT] = {CT_DROP, CT_EMPTY, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; static colType readInt64As=CT_INT64; static SEXP selectSxp; static SEXP dropSxp;