Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster by=key(DT)[1] and keyby= index #3040

Merged
merged 10 commits into from
Sep 13, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ unsigned long long (*twiddle)(void *, int, int);
SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg);
bool need2utf8(SEXP x, int n);
SEXP isReallyReal(SEXP);
int getNumericRounding_C();

// reorder.c
SEXP reorder(SEXP x, SEXP order);
Expand Down
8 changes: 7 additions & 1 deletion src/forder.c
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ SEXP setNumericRounding(SEXP droundArg)
// init.c has initial call with default of 2
{
if (!isInteger(droundArg) || LENGTH(droundArg)!=1) error("Must an integer or numeric vector length 1");
if (INTEGER(droundArg)[0] < 0 || INTEGER(droundArg)[0] > 2) error("Must be 2 (default) or 1 or 0");
if (INTEGER(droundArg)[0] < 0 || INTEGER(droundArg)[0] > 2) error("Must be 2, 1 or 0");
dround = INTEGER(droundArg)[0];
dmask1 = dround ? 1 << (8*dround-1) : 0;
dmask2 = 0xffffffffffffffff << dround*8;
Expand All @@ -467,6 +467,12 @@ SEXP getNumericRounding()
return ScalarInteger(dround);
}

int getNumericRounding_C(void)
// Plain C accessor for the current numeric rounding level, for use in uniqlist.c.
// Returns the file-level dround value as a bare int (no SEXP allocation), i.e.
// whatever setNumericRounding() last stored, which it restricts to 0, 1 or 2.
// Declared (void) so the definition is a real prototype and calls that pass
// arguments by mistake are rejected by the compiler.
{
  return dround;
}

static union {
double d;
unsigned long long ull;
Expand Down
107 changes: 76 additions & 31 deletions src/uniqlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,39 +22,84 @@ SEXP uniqlist(SEXP l, SEXP order)
iidx[0] = 1; // first row is always the first of the first group
byorder = INTEGER(order)[0] != -1;
// Using MISSING() does not seem stable under windows. Always having arguments passed in seems a good idea anyway.
thisi = byorder ? INTEGER(order)[0]-1 : 0;
for (i=1; i<nrow; i++) {
previ = thisi;
thisi = byorder ? INTEGER(order)[i]-1 : i;
j = ncol; // the last column varies the most frequently so check that first and work backwards
b = TRUE;
while (--j>=0 && b) {
v=VECTOR_ELT(l,j);
switch (TYPEOF(v)) {
case INTSXP : case LGLSXP : // NA_INTEGER==NA_LOGICAL checked in init.c
b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break;
case STRSXP :
// fix for #469, when key is set, duplicated calls uniqlist, where encoding
// needs to be taken care of.
b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings.
case REALSXP :
ulv = (unsigned long long *)REAL(v);
b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup)
if (!b) {
class = getAttrib(v, R_ClassSymbol);
twiddle = (isString(class) && STRING_ELT(class, 0)==char_integer64) ? &i64twiddle : &dtwiddle;
b = twiddle(ulv, thisi, 1) == twiddle(ulv, previ, 1);
}
break;
// TO DO: store previ twiddle call, but it'll need to be vector since this is in a loop through columns. Hopefully the first == will short circuit most often
default :
error("Type '%s' not supported", type2char(TYPEOF(v)));

if (ncol==1 && !byorder) {
// e.g. by=key(DT)[1]

#define COMPARE1 \
prev = *vd; \
for (int i=1; i<nrow; i++) { \
this = *++vd; \
if (this!=prev
#define COMPARE2 \
) { \
iidx[len++] = i+1; \
if (len>=isize) { \
isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i))); \
iidx = Realloc(iidx, isize, int); \
} \
} \
prev = this; \
}

SEXP v = VECTOR_ELT(l,0);
switch(TYPEOF(v)) {
case INTSXP : case LGLSXP : {
int *vd=INTEGER(v), prev, this;
COMPARE1 COMPARE2
} break;
case STRSXP : {
SEXP *vd=DATAPTR(v), prev, this;
COMPARE1 && ENC2UTF8(this)!=ENC2UTF8(prev) COMPARE2 // but most of the time they are equal, so ENC2UTF8 doesn't need to be called
} break;
case REALSXP : {
uint64_t *vd=(uint64_t *)REAL(v), prev, this;
if (getNumericRounding_C()==0 /*default*/ || inherits(v, "integer64")) {
COMPARE1 COMPARE2
} else {
COMPARE1 && dtwiddle(&this, 0, 1)!=dtwiddle(&prev, 0, 1) COMPARE2
}
} break;
default :
error("Type '%s' not supported", type2char(TYPEOF(v)));
}
if (!b) iidx[len++] = i+1;
if (len >= isize) {
isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i)));
iidx = Realloc(iidx, isize, int);
} else {
thisi = byorder ? INTEGER(order)[0]-1 : 0;
for (i=1; i<nrow; i++) {
previ = thisi;
thisi = byorder ? INTEGER(order)[i]-1 : i;
j = ncol; // the last column varies the most frequently so check that first and work backwards
b = TRUE;
while (--j>=0 && b) {
v=VECTOR_ELT(l,j);
switch (TYPEOF(v)) {
case INTSXP : case LGLSXP : // NA_INTEGER==NA_LOGICAL checked in init.c
b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break;
case STRSXP :
// fix for #469, when key is set, duplicated calls uniqlist, where encoding
// needs to be taken care of.
b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings.
case REALSXP :
ulv = (unsigned long long *)REAL(v);
b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup)
if (!b) {
class = getAttrib(v, R_ClassSymbol);
twiddle = (isString(class) && STRING_ELT(class, 0)==char_integer64) ? &i64twiddle : &dtwiddle;
b = twiddle(ulv, thisi, 1) == twiddle(ulv, previ, 1);
}
break;
// TO DO: store previ twiddle call, but it'll need to be vector since this is in a loop through columns. Hopefully the first == will short circuit most often
default :
error("Type '%s' not supported", type2char(TYPEOF(v)));
}
}
if (!b) {
iidx[len++] = i+1;
if (len >= isize) {
isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i)));
iidx = Realloc(iidx, isize, int);
}
}
}
}
PROTECT(ans = allocVector(INTSXP, len));
Expand Down