Excel, R, and Tableau analysis comparing predictive models on emergency department data. For this analysis, one month of records from an emergency department was collected and is analyzed through several different tools.
The written analysis describes the overall issue, the existing recording metrics, the findings, and proposed solutions. I have also included the recorded PowerPoint presentation of the findings, along with alternate visuals created while reviewing the data.
This code uses R to explore the predictive qualities of Support Vector Machine (SVM), Naive Bayes, Logistic Regression, and Random Forest models in relation to emergency department (ED) overcrowding.
Data file (cleaned before being loaded into R)
Packages
install.packages("tidyverse")
install.packages("descr")
install.packages("forcats")
install.packages("e1071")
install.packages("caret")
install.packages("lattice")
install.packages("caTools")
install.packages("rmarkdown")
install.packages("ggplot2")
install.packages("corrplot")
install.packages("randomForest")
install.packages("plyr")
install.packages("ROSE")
install.packages("readxl")
install.packages("lubridate")
install.packages("zoo")
install.packages("magrittr")
install.packages("ROCR")
install.packages("pROC")
install.packages("readr")
library(plyr)
library(dplyr)
library(tidyverse)
library(descr) # Descriptive Statistics
library(ggplot2)
library(forcats) # Tools for Working with Categorical Variables (Factors)
library(e1071) # Misc Functions of the Department of Statistics, Probability Theory Group
library(lattice)
library(caret) # Classification and Regression Training
library(caTools)
library(rmarkdown)
library(corrplot)
library(randomForest)
library(ROSE)
library(readxl)
library(lubridate)
library(zoo)
library(magrittr)
library(ROCR)
library(pROC)
library(readr)
Load files (re-run these lines to reload the same files)
nedoc_data <- read.csv(file.choose())
dfilter1 <- read.csv(file.choose())
Pdfilter1 <- read.csv(file.choose())
#PD filter is for predictive
#D filter is for general analysis and exploratory predictive
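file.choose() opens an interactive file picker on every run; for a reproducible script the paths can be hard-coded instead (illustrative filenames, not the original ones):
nedoc_data <- read.csv("nedoc_data.csv")
dfilter1 <- read.csv("d_filter.csv")
Pdfilter1 <- read.csv("pd_filter.csv")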
Data Prep
#####-----------------------DATA PREP
dfilter1$OVR <- as.factor(dfilter1$OVR)
dfilter1$SEV <- as.factor(dfilter1$SEV)
dfilter1$DNG <- as.factor(dfilter1$DNG)
dfilter1$Hour <- as.factor(dfilter1$Hour)
dfilter1$Day <- as.factor(dfilter1$Day)
dfilter1$Week <- as.factor(dfilter1$Week)
DNGfile <- filter(dfilter1, NAVG == 4)
SEVfile <- filter(dfilter1, NAVG == 3)
OVRfile <- filter(dfilter1, NAVG == 2)
NE_Week <- filter(dfilter1, BSY == 1, DNG == 0, Week %in% c(2,3,4))
NE_Week2 <- filter(dfilter1, DNG == 0, Week %in% c(1,2,3,4))
#The general filter file can be reused here because, after cleaning, the data is copied into separate files for prediction
#Move OVR to the front with Date so that Naive Bayes and SVM can share the same dataset
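A minimal sketch of that reordering (assuming the date column is literally named Date, which is not shown in the original script):
Pdfilter1 <- select(Pdfilter1, Date, OVR, everything())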
Pdfilter1$OVR <- as.factor(Pdfilter1$OVR)
Pdfilter1$Hour <- as.factor(Pdfilter1$Hour)
Pdfilter1$Day <- as.factor(Pdfilter1$Day)
Pdfilter1$Week <- as.factor(Pdfilter1$Week)
Pdfilter1$OVRcut <- ifelse(Pdfilter1$OVR == 0,"good","bad")
Pdfilter1$OVRcut <- as.factor(Pdfilter1$OVRcut)
summary(Pdfilter1$DEP)
Pdfilter1$DEPcut <- cut(Pdfilter1$DEP, breaks=c(20,43,76,109),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(Pdfilter1$DEPcut)
summary(Pdfilter1$EDW)
Pdfilter1$EDWcut <- cut(Pdfilter1$EDW, breaks=c(0,6,14,29),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(Pdfilter1$EDWcut)
summary(Pdfilter1$CC)
Pdfilter1$CCcut <- cut(Pdfilter1$CC, breaks=c(0,3.5,7,14),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(Pdfilter1$CCcut)
summary(Pdfilter1$DTB)
Pdfilter1$DTBcut <- cut(Pdfilter1$DTB, breaks=c(-0.07,0.03,0.8825,6.76),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(Pdfilter1$DTBcut)
summary(Pdfilter1$LAT)
Pdfilter1$LAT2 <- Pdfilter1$LAT
Pdfilter1$LAT2[Pdfilter1$LAT2 == 0.00] <- 0.01
Pdfilter1$LATcut <- cut(Pdfilter1$LAT2, breaks=c(0.00,1.63,6.562,30.02),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(Pdfilter1$LATcut)
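The summary/cut/table pattern above repeats for every predictor; a small helper function (hypothetical, not part of the original script) would condense it, assuming the quartile breaks are distinct:
cut_quartiles <- function(x) {
  q <- quantile(x, probs = c(0, 0.25, 0.75, 1), na.rm = TRUE)
  cut(x, breaks = q, labels = c("bottom 25%","middle 50%","top 25%"), include.lowest = TRUE)
}
#e.g. Pdfilter1$DEPcut <- cut_quartiles(Pdfilter1$DEP)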
str(Pdfilter1)
SVMD1 <- Pdfilter1
SVMD1$OVR <- as.factor(SVMD1$OVR)
SVMD1$Hour <- as.factor(SVMD1$Hour)
SVMD1$Day <- as.factor(SVMD1$Day)
SVMD1$Week <- as.factor(SVMD1$Week)
NaiveD1 <- Pdfilter1
str(NaiveD1)
NaiveD1$OVRcut <- ifelse(NaiveD1$OVR == 0,"good","bad")
NaiveD1$OVRcut <- as.factor(NaiveD1$OVRcut)
summary(NaiveD1$DEP)
NaiveD1$DEPcut <- cut(NaiveD1$DEP, breaks=c(20,43,76,109),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(NaiveD1$DEPcut)
summary(NaiveD1$EDW)
NaiveD1$EDWcut <- cut(NaiveD1$EDW, breaks=c(0,6,14,29),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(NaiveD1$EDWcut)
summary(NaiveD1$CC)
NaiveD1$CCcut <- cut(NaiveD1$CC, breaks=c(0,3.5,7,14),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(NaiveD1$CCcut)
summary(NaiveD1$DTB)
NaiveD1$DTBcut <- cut(NaiveD1$DTB, breaks=c(-0.07,0.03,0.8825,6.76),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(NaiveD1$DTBcut)
summary(NaiveD1$LAT)
#use LAT2 (zeros bumped to 0.01) so no values fall on the open lower break
NaiveD1$LATcut <- cut(NaiveD1$LAT2, breaks=c(0.00,1.63,6.562,30.02),
labels=c("bottom 25%","middle 50%", "top 25%"))
table(NaiveD1$LATcut)
SVMData <- select(SVMD1, DEP, EDW, CC, DTB, LAT, Hour, Day, Week, OVR)
NaiveData <- select(NaiveD1,OVRcut, DEPcut,EDWcut,CCcut,DTBcut,LATcut,Hour, Day, Week)
Data Viz pre-predictive
#############------------------DATA VIZ
#Distribution of NeDoc Scores
DViz <- ggplot(data=dfilter1, aes(x=AVG)) +
geom_histogram(aes(y=..density..),
col='black',
fill='dodgerblue1',
alpha=0.3) +
geom_density(adjust=3)
print(DViz + theme(plot.title=element_text(face="bold")) + ggtitle('Distribution of the NeDoc Averages'))
#NE_Week2 distribution viz: shows a more balanced average once week 5 and the Danger hours (which occur only in week 1) are excluded
DVizNE <- ggplot(data=NE_Week2, aes(x=AVG)) +
geom_histogram(aes(y=..density..),
col='black',
fill='dodgerblue1',
alpha=0.3) +
geom_density(adjust=3)
print(DVizNE + theme(plot.title=element_text(face="bold")) + ggtitle('NE_Week Distribution of the NeDoc Averages'))
##Hour chart shows a good representation for OVR
NAVGdist <- ggplot(data=dfilter1, aes(x=Hour, fill=OVR)) +
geom_bar(aes(y = (..count..)/sum(..count..)), position='stack', alpha=0.5) + scale_y_continuous(labels=scales::percent)
print(NAVGdist + theme(plot.title=element_text(face="bold")) + ggtitle('Hour to Overcrowded'))
##Pretty cool Day Chart
NAVGdist <- ggplot(data=dfilter1, aes(x=Day, fill=OVR)) +
geom_bar(aes(y = (..count..)/sum(..count..)), position='stack', alpha=0.5) + scale_y_continuous(labels=scales::percent)
print(NAVGdist + theme(plot.title=element_text(face="bold")) + ggtitle('Day to Overcrowded'))
#good rep of decline in overcrowded
NAVGdist <- ggplot(data=dfilter1, aes(x=Week, fill=OVR)) +
geom_bar(aes(y = (..count..)/sum(..count..)), position='stack', alpha=0.5) + scale_y_continuous(labels=scales::percent)
print(NAVGdist + theme(plot.title=element_text(face="bold")) + ggtitle('Week to Overcrowded'))
#Higher DEP means more OVR; same for CC and LAT (EDW and DTB are evenly distributed)
#Template for further charts: swap in the variable of interest for the x aesthetic and update the title
NAVGdist <- ggplot(data=dfilter1, aes(x=Week, fill=OVR)) +
geom_bar(aes(y = (..count..)/sum(..count..)), position='stack', alpha=0.5) + scale_y_continuous(labels=scales::percent)
print(NAVGdist + theme(plot.title=element_text(face="bold")) + ggtitle('Variable vs Overcrowded'))
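corrplot is loaded above but never used; a quick sketch (not part of the original analysis) of a correlation matrix over the numeric predictors in the predictive file would back up the distribution notes above:
predcors <- cor(select(Pdfilter1, DEP, EDW, CC, DTB, LAT), use = "complete.obs")
corrplot(predcors, method = "circle")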
Establishing the train and test datasets used across the different models
#########------------------PREDICTIVE
#Establish train and test sets
##One full file is split here; model-specific train/test subsets are selected below
Predictivedata <- Pdfilter1
set.seed(101)
PredData <- sample.split(Predictivedata$OVR, SplitRatio = 0.7)
PredDatatrain <- subset(Predictivedata, PredData == TRUE)
PredDatatest <- subset(Predictivedata, PredData == FALSE)
AASVMDatatrain <- select(PredDatatrain, DEP, EDW, CC, DTB, LAT, Hour, Day, Week, OVR)
AASVMDatatest <- select(PredDatatest, DEP, EDW, CC, DTB, LAT, Hour, Day, Week, OVR)
AANaiveDatatrain <- select(PredDatatrain,OVRcut, DEPcut,EDWcut,CCcut,DTBcut,LATcut,Hour, Day, Week)
AANaiveDatatest <- select(PredDatatest,OVRcut, DEPcut,EDWcut,CCcut,DTBcut,LATcut,Hour, Day, Week)
AARTDatatrain <- select(PredDatatrain, DEP, EDW, CC, DTB, LAT, Hour, Day, Week, OVR)
AARTDatatest <- select(PredDatatest, DEP, EDW, CC, DTB, LAT, Hour, Day, Week, OVR)
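A quick sanity check, not in the original script, that sample.split preserved the OVR class ratio in both subsets:
prop.table(table(PredDatatrain$OVR))
prop.table(table(PredDatatest$OVR))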
SVM
##SVM
str(AASVMDatatrain)
#OVR as the response; the dot expands to the remaining eight predictors only
SVMmodelA <- svm(OVR ~ ., data = AASVMDatatrain)
summary(SVMmodelA)
SVMpreds <- predict(SVMmodelA,AASVMDatatest[1:8])
table(SVMpreds, AASVMDatatest$OVR)
SVMpreds.tuned <- svm(OVR ~ ., data = AASVMDatatrain, kernel='radial', cost=70, gamma=0.2)
SVMpreds2 <- predict(SVMpreds.tuned, AASVMDatatest[1:8])
confusionMatrix(SVMpreds2, AASVMDatatest$OVR)
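The cost and gamma values above look hand-picked; a hedged sketch of how a grid search with e1071's tune.svm could choose them (illustrative ranges, not the ones used for this project):
SVMgrid <- tune.svm(OVR ~ ., data = AASVMDatatrain, gamma = c(0.1, 0.2, 0.5), cost = c(10, 70, 100))
SVMgrid$best.parameters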
#Create a copy with OVR reversed (ROVR) for the ROC chart
SVMROCtrain <- AASVMDatatrain
SVMROCtrain$ROVR <- ifelse(SVMROCtrain$OVR %in% c(1), 0, 1)
SVMROCtest <- AASVMDatatest
SVMROCtest$ROVR <- ifelse(SVMROCtest$OVR %in% c(1), 0, 1)
SVMROCtrain$ROVR <- as.factor(SVMROCtrain$ROVR)
SVMROCtest$ROVR <- as.factor(SVMROCtest$ROVR)
#exclude OVR from the predictors, since it determines ROVR exactly
SVMmodelROC <- svm(ROVR ~ . - OVR, data = SVMROCtrain, kernel='radial', cost=70, gamma=0.2)
SVMpredsROC <- predict(SVMmodelROC, SVMROCtest)
confusionMatrix(SVMpredsROC, SVMROCtest$ROVR)
Naive Bayes
## Naive
str(AANaiveDatatrain)
Naiveclassy <- naiveBayes(OVRcut ~ DEPcut+EDWcut+CCcut+DTBcut+LATcut+Hour+Day+Week,AANaiveDatatrain)
Naiveclassy
Naivepreds <- predict(Naiveclassy, select(AANaiveDatatest, DEPcut,EDWcut,CCcut,DTBcut,LATcut,Hour, Day, Week), type="raw")
summary(Naivepreds)
AANaiveDatatest$Naive_preds <- ifelse(Naivepreds[,"good"] > 0.70, "good", "bad")
table(AANaiveDatatest$Naive_preds)
NaiveAtestA<-factor(AANaiveDatatest$OVRcut)
NaiveAtestB<-factor(AANaiveDatatest$Naive_preds)
confusionMatrix(NaiveAtestB, NaiveAtestA)
Logistic Regression
##Logistic Regression
str(AARTDatatrain)
str(AARTDatatest$OVR)
logisticreg <- glm(OVR~., family=binomial(link='logit'), data = AARTDatatrain)
LRpreds <- predict(logisticreg,AARTDatatest, type ='response')
LRpreds <- ifelse(LRpreds > 0.5,1,0)
compare <- data.frame(actual = AARTDatatest$OVR, predicted = LRpreds)
misClassError <- mean(LRpreds != AARTDatatest$OVR)
print(paste('Accuracy', 1 - misClassError))
confusionMatrix(as.factor(LRpreds),as.factor(AARTDatatest$OVR))
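The fitted model itself is worth a look; summary() (not run in the original) reports which predictors carry significant coefficients:
summary(logisticreg)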
Random Forest
##Random Forest
str(AARTDatatrain)
RFmodel <- randomForest(OVR ~ ., data=AARTDatatrain, importance = TRUE, ntree = 200, na.action = na.omit)
RFpreds <- predict(RFmodel, AARTDatatest, type='class')
#caret expects predictions first, reference second
RFoutput <- confusionMatrix(RFpreds, AARTDatatest$OVR)
paste0(RFoutput$overall[1])
#Confusion Matrix
RFoutput
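Because the forest was fit with importance = TRUE, the variable importances are available; inspecting them (a step not in the original run) shows which inputs drive the OVR prediction:
importance(RFmodel)
varImpPlot(RFmodel)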
Here is each model's confusion matrix
#SVM
#(model and predictions carried over from the SVM section above)
confusionMatrix(SVMpredsROC, SVMROCtest$ROVR)
#Naive Bayes
#(factors carried over from the Naive Bayes section above)
confusionMatrix(NaiveAtestB, NaiveAtestA)
#Logistic Regression
confusionMatrix(as.factor(LRpreds),as.factor(AARTDatatest$OVR))
#Random Forest
#(RFoutput carried over from the Random Forest section above)
RFoutput
ROC Curve
##Full comparison ROC Curve
#ROC Logistic Regression
#use the raw predicted probabilities; LRpreds was thresholded to 0/1 above, which would flatten the ROC to a single point
LRprob <- predict(logisticreg, AARTDatatest, type='response')
LRpredObj <- prediction(LRprob, AARTDatatest$OVR)
LRrocObj <- performance(LRpredObj, measure="tpr", x.measure="fpr")
LRaucObj <- performance(LRpredObj, measure="auc")
#ROC Random Forest
predRFprob <- predict(RFmodel, AARTDatatest, type = "prob")
RFval <- predRFprob[,2]
RFpredObj <- prediction(RFval, AARTDatatest$OVR)
RFrocObj <- performance(RFpredObj, measure="tpr", x.measure="fpr")
RFaucObj <- performance(RFpredObj, measure="auc")
#ROC Naive Bayes
#Naivepreds already holds the raw class probabilities; score the "good" column against the OVRcut labels
NBval <- Naivepreds[,"good"]
NBpredObj <- prediction(NBval, AANaiveDatatest$OVRcut)
NBrocObj <- performance(NBpredObj, measure="tpr", x.measure="fpr")
NBaucObj <- performance(NBpredObj, measure="auc")
#ROC SVM
#refit with probability=TRUE so ROCR gets a continuous score; hard class labels alone give a one-point ROC
SVMmodelROC <- svm(ROVR ~ . - OVR, data = SVMROCtrain, kernel='radial', cost=70, gamma=0.2, probability=TRUE)
SVMpredsROC <- predict(SVMmodelROC, SVMROCtest, probability=TRUE)
SVMval <- attr(SVMpredsROC, "probabilities")[,"1"]
SVMpredObj <- prediction(SVMval, SVMROCtest$ROVR)
SVMrocObj <- performance(SVMpredObj, measure="tpr", x.measure="fpr")
SVMaucObj <- performance(SVMpredObj, measure="auc")
plot(LRrocObj, col = "blue", lwd = 1, main = "ROC Curves")
plot(RFrocObj, add = TRUE, col = "red")
plot(SVMrocObj, add = TRUE, col = "green")
plot(NBrocObj, add = TRUE, col = "brown")
abline(a=0, b=1)
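The AUC objects are computed but never printed; the values can be pulled out of the ROCR performance objects like so:
paste0("LR AUC: ", round(LRaucObj@y.values[[1]], 3))
paste0("RF AUC: ", round(RFaucObj@y.values[[1]], 3))
paste0("NB AUC: ", round(NBaucObj@y.values[[1]], 3))
paste0("SVM AUC: ", round(SVMaucObj@y.values[[1]], 3))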
Where the Naive Bayes (NB) and logistic regression (LR) curves overlap, the brown NB line is drawn over and can hide the blue LR line.
This comparison shows that, for this data, the Random Forest model was the best at predicting emergency department overcrowding.
