Merge pull request #94 from Chicago/dev

separated out the model from the evaluation, and ran on a basic RF model
Chicago · Apr 15, 2016 · 7afd83e · 7afd83e
2 parents 26f4d9e + e8e62d1
commit 7afd83e
Show file tree

Hide file tree

Showing 4 changed files with 439 additions and 91 deletions.
diff --git a/CODE/30_glmnet_model.R b/CODE/30_glmnet_model.R
@@ -51,7 +51,7 @@ xmat <- dat[ , list(Inspector = Inspector_Assigned,
                     heat_garbage = pmin(heat_garbage, 50),
                     # Facility_Type,
                     criticalFound),
-            keyby = Inspection_ID]
+             keyby = Inspection_ID]
 mm <- model.matrix(criticalFound ~ . -1, data=xmat[ , -1, with=F])
 mm <- as.data.table(mm)
 str(mm)
@@ -74,107 +74,61 @@ nrow(mm)
 
 ##==============================================================================
 ## GLMNET MODEL
+## FOR MORE INFO SEE:
+## http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
 ##==============================================================================
-# fit ridge regression, alpha = 0, only inspector coefficients penalized
-pen <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0)
-model <- glmnet(x = as.matrix(mm[iiTrain]),
-                y = xmat[iiTrain,  criticalFound],
-                family = "binomial", 
-                alpha = 0,
-                penalty.factor = pen)
-
-## See what regularization parameter 'lambda' is optimal on tune set
-## (We are essentially usin the previous hardcoded value)
-errors <- sapply(model$lambda, 
-                 function(lam) 
-                     logLik(p = predict(model, 
-                                        newx = as.matrix(mm[iiTrain]), 
-                                        s=lam, 
-                                        type="response")[,1], 
-                            y = xmat[iiTrain, criticalFound]))
-## Plot of the errors by lambda
-plot(x=log(model$lambda), y=errors, type="l")
-which.min(errors)
-model$lambda[which.min(errors)]
-## Manual selection of lambda
-w.lam <- 100
-lam <- model$lambda[w.lam]
-coef <- model$beta[,w.lam]
-inspCoef <- coef[grepl("^Inspector",names(coef))]
-inspCoef <- inspCoef[order(-inspCoef)]
-
-## Print coefficients for the Inspectors (and for other variables)
-inspCoef
-coef[!grepl("^Inspector",names(coef))]
-
-## Plot of the errors by Lambda for the out of sample Test data
-errorsTest <- sapply(model$lambda, 
-                     function(lam) 
-                         logLik(p = predict(model, 
-                                            newx = as.matrix(mm[iiTest]), 
-                                            s=lam, 
-                                            type="response")[,1], 
-                                y = xmat[iiTest, criticalFound]))
-plot(x=log(model$lambda), y=errorsTest, type="l")
-which.min(errorsTest)
-model$lambda[which.min(errorsTest)]
 
-## ATTACH PREDICTIONS TO DAT
-dat$score <- predict(model, newx=as.matrix(mm), 
-                     s=lam, 
-                     type="response")[,1]
-
-# Show gini performance of inspector model on tune data set
-dat[iiTest, gini(score, criticalFound, plot=TRUE)]
-
-## Calculate confusion matrix values for evaluation
-calculate_confusion_values(actual = xmat[iiTest, criticalFound],
-                           expected = dat[iiTest, score], 
-                           r = .25)
-
-## Calculate matrix of confusion matrix values for evaluation
-confusion_values_test <- t(sapply(seq(0, 1 ,.01), 
-                                  calculate_confusion_values,
-                                  actual = xmat[iiTest, criticalFound],
-                                  expected = dat[iiTest, score]))
-confusion_values_test
-ggplot(reshape2::melt(as.data.table(confusion_values_test), 
-                      id.vars="r")) + 
-    aes(x=r, y=value, colour=variable) + geom_line() + 
-    geom_hline(yintercept = c(0,1))
+# fit ridge regression, alpha = 0, only inspector coefficients penalized
+penalty <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0)
+
+## Find best lambda based on CV results
+## Note, The cvfit object includes the top model
+cvfit <- cv.glmnet(x = as.matrix(mm[iiTrain]),
+                   y = xmat[iiTrain,  criticalFound],
+                   family = "binomial", 
+                   alpha = 0,
+                   penalty.factor = penalty,
+                   type.measure = "deviance")
+
+## View of results
+plot(cvfit)
+cvfit$lambda
+cvfit$lambda.min
 
 ##==============================================================================
-## MODEL EVALUATION
-##    - TIME SAVINGS
-##    - PERIOD A vs PERIOD B
+## ATTACH PREDICTIONS TO DAT
 ##==============================================================================
-## Subset of just observations during test period:
-datTest <- dat[iiTest]
 
-## Mean time savings:
-datTest[ , simulated_date_diff_mean(Inspection_Date, score, criticalFound)]
+## Attach predictions for top lambda choice to the data
+dat$score <- predict(cvfit$glmnet.fit, 
+                     newx = as.matrix(mm), 
+                     s = cvfit$lambda.min,
+                     type = "response")[,1]
 
-## Detailed time savings:
-bins <- datTest[ , simulated_bin_summary(Inspection_Date, score, criticalFound)]
-bins
+## Identify each row as test / train
+dat$Test <- 1:nrow(dat) %in% iiTest
+dat$Train <- 1:nrow(dat) %in% iiTrain
 
-## This calculation is the weighted average date difference, which should match
-## the previous result from `simulated_bin_summary`
-bins[ , sum(as.integer(date) * POS) / sum(POS)] -
-    bins[ , sum(as.integer(date) * POS_SIM) / sum(POS_SIM)]
+## Calculate scores for all lambda values
+allscores <- predict(cvfit$glmnet.fit, 
+                     newx = as.matrix(mm), 
+                     s = cvfit$glmnet.fit$lambda,
+                     type = "response")
 
-## Find the midpoint of the inspections to divide bins into period A & B
-mid <- bins[ , sum(N)/2]
+allscores <- as.data.table(allscores)
+setnames(allscores, 
+         cvfit$glmnet.fit$beta@Dimnames[[2]])
 
-## Divide the bins into period A & B based on midpoint
-## Note: GT and LT is strict, so ties would be excluded.  Although there are no
-##       ties for now (as of 2015 publication).
-binsA <- bins[NTOT < mid]
-binsB <- bins[NTOT > mid]
+## Identify each row as test / train
+allscores$Test <- 1:nrow(allscores) %in% iiTest
+allscores$Train <- 1:nrow(allscores) %in% iiTrain
 
-tot_crit <- sum(bins$POS)
+##==============================================================================
+## SAVE RESULTS
+##==============================================================================
 
-binsA[ , sum(POS)/tot_crit]     ## [1] 0.5465116
-binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705
+saveRDS(dat, "DATA/30_glmnet_data.Rds")
+saveRDS(cvfit, "DATA/30_glmnet_cvfit.Rds")
+saveRDS(allscores, "DATA/30_glmnet_allscores.Rds")
 
 
diff --git a/CODE/30_random_forest_model.R b/CODE/30_random_forest_model.R
@@ -0,0 +1,98 @@
+
+##==============================================================================
+## INITIALIZE
+##==============================================================================
+if(interactive()){
+    ## Remove all objects; perform garbage collection
+    rm(list=ls())
+    gc(reset=TRUE)
+    ## Detach libraries that are not used
+    geneorama::detach_nonstandard_packages()
+}
+## Load libraries that are used
+geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2"))
+## Load custom functions
+geneorama::sourceDir("CODE/functions/")
+
+##==============================================================================
+## LOAD CACHED RDS FILES
+##==============================================================================
+dat <- readRDS("DATA/dat_model.Rds")
+
+## Only keep "Retail Food Establishment"
+dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"]
+## Remove License Description
+dat[ , LICENSE_DESCRIPTION := NULL]
+dat <- na.omit(dat)
+
+## Add criticalFound variable to dat:
+dat[ , criticalFound := pmin(1, criticalCount)]
+
+## Set the key for dat
+setkey(dat, Inspection_ID)
+
+## Match time period of original results
+# dat <- dat[Inspection_Date < "2013-09-01" | Inspection_Date > "2014-07-01"]
+
+##==============================================================================
+## CREATE MODEL DATA
+##==============================================================================
+# sort(colnames(dat))
+xmat <- dat[ , list(Inspector = Inspector_Assigned,
+                    pastSerious = pmin(pastSerious, 1),
+                    pastCritical = pmin(pastCritical, 1),
+                    timeSinceLast,
+                    ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L),
+                    consumption_on_premises_incidental_activity,
+                    tobacco_retail_over_counter,
+                    temperatureMax,
+                    heat_burglary = pmin(heat_burglary, 70),
+                    heat_sanitation = pmin(heat_sanitation, 70),
+                    heat_garbage = pmin(heat_garbage, 50),
+                    # Facility_Type,
+                    criticalFound),
+             keyby = Inspection_ID]
+mm <- model.matrix(criticalFound ~ . -1, data=xmat[ , -1, with=F])
+mm <- as.data.table(mm)
+str(mm)
+colnames(mm)
+
+##==============================================================================
+## CREATE TEST / TRAIN PARTITIONS
+##==============================================================================
+## 2014-07-01 is an easy separator
+dat[Inspection_Date < "2014-07-01", range(Inspection_Date)]
+dat[Inspection_Date > "2014-07-01", range(Inspection_Date)]
+
+iiTrain <- dat[ , which(Inspection_Date < "2014-07-01")]
+iiTest <- dat[ , which(Inspection_Date > "2014-07-01")]
+
+## Check to see if any rows didn't make it through the model.matrix formula
+nrow(dat)
+nrow(xmat)
+nrow(mm)
+
+##==============================================================================
+## RANDOM FOREST MODEL
+##==============================================================================
+model <- randomForest(x = as.matrix(mm[iiTrain]),
+                      y = as.factor(xmat[iiTrain, criticalFound]),
+                      importance=TRUE)
+
+## ATTACH PREDICTIONS TO DAT
+dat$score <- predict(model, as.matrix(mm),
+                    type="prob")[ , 2]
+
+## Identify each row as test / train
+dat$Test <- 1:nrow(dat) %in% iiTest
+dat$Train <- 1:nrow(dat) %in% iiTrain
+
+##==============================================================================
+## SAVE RESULTS
+##==============================================================================
+
+saveRDS(dat, "DATA/30_random_forest_data.Rds")
+saveRDS(model, "DATA/30_random_forest_model.Rds")
+
+
+