# Load of libraries
library(tidyverse)
library(zoo)
library(corrplot)
library(MASS)
library(reshape2)
Recently, I discovered a new website about competitions that it is not called Kaggle! Its name is Drivendata.
DrivenData offers different competitions related with multiple types of field, such as health (oh yes!), ecology, society… with a common element: to face the world’s biggest social challenges.
I decided to join my first competition called ‘DengAI: Predicting Disease Spread’. In this case, the user receives a set of weather information (temperatures, precipitations, vegetations) from two cities: San Juan (Puerto Rico) and Iquitos (Peru) with total cases of dengue by year and week of every year.
The goal of the competition is to develop a prediction model that would be able to anticipate the cases of dengue in every city depending on a set of climate variables.
The DrivenData’s blog wrote some days ago, a post about a fast approach with this dataset. It was written in Python. So, I decided to “translate” to R language.
The next code is divided into three main points:
1. Code with clean tasks (transform NA values, remove of columns…) and exploratory analyses.
2. Function with every step during cleaning of data.
3. Development of model, prediction and comparison of predicted vs real total cases detected.
# Load data
<- read.csv('data/dengue_features_train.csv')
train_features
<- read.csv('data/dengue_labels_train.csv')
train_labels <- read.csv('data/dengue_features_test.csv')
test_features <- read.csv('data/submission_format.csv')
submission_format
# Filter of data by city
<- filter(train_labels, city == 'sj')
sj_train_labels <- filter(train_features, city == 'sj')
sj_train_features <- filter(train_labels, city == 'iq')
iq_train_labels <- filter(train_features, city == 'iq') iq_train_features
# Is there NA values?
<- as.data.frame(apply(sj_train_features,2, function(x) any(is.na(x))))
df_na_sj colnames(df_na_sj) <- 'is_there_NA'
$number_NA <- apply(sj_train_features,2, function(x) sum(is.na(x)))
df_na_sj$mean_NA <- apply(sj_train_features, 2, function(x) mean(is.na(x)))
df_na_sj<- as.data.frame(apply(iq_train_features, 2, function(x) any(is.na(x))))
df_na_iq colnames(df_na_iq) <- 'is_there_NA'
$number_NA <- apply(iq_train_features, 2, function(x) sum(is.na(x)))
df_na_iq$mean_NA <- apply(iq_train_features, 2, function(x) mean(is.na(x))) df_na_iq
# Vegetation Index over Time Plot with NAs
ggplot(sj_train_features, aes(x = as.Date(week_start_date), y = ndvi_ne )) +
ggtitle('Vegetation Index over Time') +
theme_bw() +
xlab('Title') +
geom_line(na.rm = FALSE, color = 'blue') +
theme(plot.title = element_text(hjust = 0.5))
# Remove 'weekofyear' column
<- dplyr::select(sj_train_features, -week_start_date)
sj_train_features <- dplyr::select(iq_train_features, -week_start_date)
iq_train_features # Fill the NA values with the previous value
<- sj_train_features %>%
sj_train_features do(na.locf(.))
<- iq_train_features %>%
iq_train_features do(na.locf(.))
# Distribution of labels
# print(mean(sj_train_labels$total_cases))
# print(var(sj_train_labels$total_cases))
#
# print(mean(iq_train_labels$total_cases))
# print(var(iq_train_labels$total_cases))
ggplot(sj_train_labels, aes(x = total_cases)) +
theme_bw() +
ggtitle('Cases of dengue in San Juan') +
geom_histogram() +
theme(plot.title = element_text(hjust = 0.5))
ggplot(iq_train_labels, aes(x = total_cases)) +
theme_bw() +
ggtitle('Cases of dengue in Iquitos') +
geom_histogram() +
theme(plot.title = element_text(hjust = 0.5))
# Add total_cases column to *_train_features dataframes
# sj_train_features <- left_join(sj_train_features, sj_train_labels, by = c('city', 'year', 'weekofyear'))
$total_cases <- sj_train_labels$total_cases
sj_train_features# iq_train_features <- left_join(iq_train_features, iq_train_labels, by = c('city', 'year', 'weekofyear'))
$total_cases <- iq_train_labels$total_cases
iq_train_features# Correlation matrix
<- data.matrix(sj_train_features)
m_sj_train_features <- cor(x = m_sj_train_features[,3:24], use = 'complete.obs', method = 'pearson')
m_sj_train_features <- data.matrix(iq_train_features)
m_iq_train_features <- cor(x = m_iq_train_features[,3:24], use = 'everything', method = 'pearson')
m_iq_train_features # Correlation Heatmap
corrplot(m_sj_train_features, type = 'full', tl.col = 'black', method="shade")
corrplot(m_iq_train_features, type = 'full', tl.col = 'black', method = 'shade')
# Correlation Bar plot
<- data.frame(m_sj_train_features)[2:21,]
df_sj_train_features <- dplyr::select(df_sj_train_features, total_cases)
df_sj_train_features
<- data.frame(m_iq_train_features)[2:21,]
df_iq_train_features <- dplyr::select(df_iq_train_features, total_cases)
df_iq_train_features ggplot(df_sj_train_features, aes(x= reorder(rownames(df_sj_train_features), -total_cases), y = total_cases)) +
geom_bar(stat = 'identity') +
theme_bw() +
ggtitle('Correlation of variables in San Juan') +
ylab('Correlation') +
xlab('Variables') +
coord_flip()
ggplot(df_iq_train_features, aes(x= reorder(rownames(df_sj_train_features), -total_cases), y = total_cases)) +
geom_bar(stat = 'identity') +
theme_bw() +
ggtitle('Correlation of variables in Iquitos') +
ylab('Correlation') +
xlab('Variables') +
coord_flip()
# Function data cleaning
<- function(df_dengue_features, df_dengue_labels = NULL, add_cases = TRUE) {
data_clean
# Filter by city
<- filter(df_dengue_features, city == 'sj')
sj_df_dengue_features <- filter(df_dengue_features, city == 'iq')
iq_df_dengue_features
if (add_cases == TRUE) {
<- filter(df_dengue_labels, city == 'sj')
sj_df_dengue_labels <- filter(df_dengue_labels, city == 'iq')
iq_df_dengue_labels
}# Removing week_start_date column
<- dplyr::select(sj_df_dengue_features, -week_start_date)
sj_df_dengue_features <- dplyr::select(iq_df_dengue_features, -week_start_date)
iq_df_dengue_features # Fill of NA values with the previous value
<- sj_df_dengue_features %>%
sj_df_dengue_features do(na.locf(.))
<- iq_df_dengue_features %>%
iq_df_dengue_features do(na.locf(.))
# Add total_cases to dataframe with features
if (add_cases == TRUE) {
$total_cases <- sj_df_dengue_labels$total_cases
sj_df_dengue_features$total_cases <- iq_df_dengue_labels$total_cases
iq_df_dengue_features
}
# Converting character columns into numbers
<- as.data.frame(apply(sj_df_dengue_features,2,as.numeric))
sj_df_dengue_features $city <- rep('sj', nrow(sj_df_dengue_features))
sj_df_dengue_features<- as.data.frame(apply(iq_df_dengue_features,2,as.numeric))
iq_df_dengue_features $city <- rep('iq', nrow(iq_df_dengue_features))
iq_df_dengue_features
<- list(sj_df_dengue_features, iq_df_dengue_features )
result
return(result)
}
# Getting data_training clean
<- data_clean(train_features, train_labels, TRUE)
data_train # Getting negative binomials models by city
<- glm.nb(formula = total_cases ~ reanalysis_specific_humidity_g_per_kg +
training_sj +
reanalysis_dew_point_temp_k +
station_min_temp_c data = data_train[[1]])
station_avg_temp_c, <- glm.nb(formula = total_cases ~ reanalysis_specific_humidity_g_per_kg +
training_iq +
reanalysis_dew_point_temp_k +
station_min_temp_c data = data_train[[2]])
station_avg_temp_c, # Getting data_test clean
<- data_clean(test_features, add_cases = FALSE)
data_test # Testing model with training data
<- predict(training_sj, data_train[[1]], type = 'response')
prediction_train_sj <- predict(training_iq, data_train[[2]], type = 'response')
prediction_train_iq <- data.frame('prediction' = prediction_train_sj, 'actual' = data_train[[1]]$total_cases,
df_prediction_train_sj 'time' = as.Date(train_features$week_start_date[1:936]))
<- melt(df_prediction_train_sj, id.vars = 'time')
df_prediction_train_sj ggplot(df_prediction_train_sj, aes(x = time, y = value, color = variable)) +
geom_line() +
ggtitle('Dengue predicted Cases vs. Actual Cases (City-San Juan) ')
<- data.frame('prediction' = prediction_train_iq, 'actual' = data_train[[2]]$total_cases,
df_prediction_train_iq 'time' = as.Date(train_features$week_start_date[937:1456]))
<- melt(df_prediction_train_iq, id.vars = 'time')
df_prediction_train_iq ggplot(df_prediction_train_iq, aes(x = time, y = value, color = variable)) +
geom_line() +
ggtitle('Dengue predicted Cases vs. Actual Cases (City-Iquitos) ')
# Prediction of total_cases in the data set
<- predict(training_sj, data_test[[1]], type = 'response')
prediction_sj <- predict(training_iq, data_test[[2]], type = 'response')
prediction_iq
<- data.frame('city' = rep('sj', length(prediction_sj) ),
data_prediction_sj 'total_cases' = prediction_sj,
'weekofyear' = data_test[[1]]$weekofyear,
'year' = data_test[[1]]$year )
<- data.frame('city' = rep('iq', length(prediction_iq) ),
data_prediction_iq 'total_cases' = prediction_iq,
'weekofyear' = data_test[[2]]$weekofyear,
'year' = data_test[[2]]$year)
$total_cases <- as.numeric(c(data_prediction_sj$total_cases,
submission_format$total_cases))
data_prediction_iq$total_cases <- round(submission_format$total_cases, 0)
submission_format
write.csv(submission_format,
file = 'submission_format_submit.csv', row.names = F)