.Rhistory

# Chunk setup: echo code in knitted output and run the project's package loader
knitr::opts_chunk$set(echo = TRUE)
source("utils/loadpackages.R")

### Read the credit data set from its Excel sheet
cred_df <- read_excel(
  "FNB_DATAQUEST_CRED.xlsx",
  sheet = "FNB_DATAQUEST_CRED"
)

# Data structure and number of rows
str(cred_df)
nrow(cred_df)

# Missing values per column: absolute count first, then as a % of all rows
sapply(cred_df, function(col) sum(is.na(col)))
sapply(cred_df, function(col) sum(is.na(col)) / nrow(cred_df) * 100)
# Identify flag indicators - indicators that have a high % of missing values

# Education level (base R alternative: table(cred_df$Education_level))
cred_df %>% count(Education_level)
# 95% missing
# Add a value that specifies education is not mentioned:
# 0 - Not specified, 1 - HIGHER EDUCATION/DIPLOMA/CERT, 2 - BACHELOR,
# 3 - HONOURS, 4 - MASTERS, 5 - DOCTOR

# Defaulted and not defaulted (base R alternative: table(cred_df$Default_Ind))
cred_df %>% count(Customer_default = Default_Ind)
# No missing values
# Target variable

# Number of properties owned
# (base R alternative: table(cred_df$NUM_PROP_EVER_OWNED))
cred_df %>% count(NUM_PROP_EVER_OWNED = NUM_PROP_EVER_OWNED)
# - 1: Missing values
# There is no 0

# Salary income (base R alternative: table(cred_df$salary_ind))
cred_df %>% count(Salary = salary_ind)
# 8.5% missing values
# 1 means the income comes from a salary, 0 means the income comes from some
# other source (e.g. own a business) and missing means there is no income

# Pension indicator (base R alternative: table(cred_df$pens_ind))
cred_df %>% count(Pensioner = pens_ind)
# 8.5% missing values
# The number / percentage of missing values is exactly the same as for the
# salary indicator
# 1 means pensioner, 0 means not a pensioner, missing means there is no income
# from which to tell whether a pension is received

# Weekend shopper flag
# (base R alternative: table(cred_df$WKEND_SHOPPER_FLAG_12M))
cred_df %>% count(Weekend_shopper = WKEND_SHOPPER_FLAG_12M)
# Is the customer a weekend shopper?
# 13% missing values

# Per-column summary statistics for the whole data set
summary(cred_df)
# A customer has defaulted once they miss three or more loan repayments.
# FNB wants to keep its probability of default below 10%, i.e. the number of
# defaulting customers divided by the total population should be below 10%.
# Default_Ind is the target: 1 = defaulted, 0 = did not default.
#
# %>% binds tighter than / and *, so the pipeline is evaluated before the
# division; the parentheses only make that order explicit.
(cred_df %>%
  filter(Default_Ind == 1) %>%
  count()) / nrow(cred_df) * 100
# When will the default indicator be less than 10%? : Model factors
# General answer: the numerator must be as small as possible (1), i.e. more customers who did not miss 3 or more payments
# (Interpretable statistical model, to understand the factors)
# Denominator - Customer growth (number of customers)
# Numerator - Customers who defaulted
# Small numerator - less default (the smaller the numerator, the lower the default rate)
# den > num - lower default (very large denominator in comparison to the numerator)
# Get more clients/customers that will not default on their loan.
# Model: Advise customers on factors that lower the default probability
# Model: Advise customers that defaulted
# Extra packages for the exploratory plots (attached in the original order)
library(mice)
library(lattice)
library(DataExplorer)
library(reshape2)
library(PerformanceAnalytics)

# Histogram of every continuous column
plot_histogram(cred_df)

# Columns 2 to 22 are selected by position for the correlation analysis
cred_pa <- cred_df[, 2:22]
chart.Correlation(cred_pa, histogram = TRUE, pch = 19)

# Correlation matrix on complete rows only (rows with any NA are dropped)
cred_cor <- cor(na.omit(cred_pa))
cred_cor
# Re-read the raw credit data so later recoding starts from a clean copy
cred_df <- read_excel(
  "FNB_DATAQUEST_CRED.xlsx",
  sheet = "FNB_DATAQUEST_CRED"
)

# Percentage of customers with Default_Ind == 1; the result is a one-cell data
# frame whose column is named `n`
default_cal <- (cred_df %>%
  filter(Default_Ind == 1) %>%
  count()) / nrow(cred_df) * 100
round(default_cal[["n"]])

# Print the data and its structure
cred_df
str(cred_df)
# Recode the indicator columns into readable labels.
#
# Values are matched exactly with %in% rather than with grepl(): grepl(1, x)
# is a regular-expression search on the character form of x, so any value that
# merely contains the digit (10, 0.1, 101, ...) would be treated as a 1.
# %in% also returns FALSE for NA, so missing values fall through to the last
# label of each recode, exactly as before.

# Missing education levels get an explicit label
cred_df$Education_level <- tidyr::replace_na(
  cred_df$Education_level,
  "Not Available/Specified"
)

# Target: 1 = defaulted; everything else (including NA) is labelled "No"
cred_df$Default_Ind <- ifelse(cred_df$Default_Ind %in% 1, "Yes", "No")

# Income source: 1 = salary, 0 = other source, missing = no income
cred_df$salary_ind <- ifelse(
  cred_df$salary_ind %in% 1, "Salary",
  ifelse(cred_df$salary_ind %in% 0, "Other/Own", "No Income")
)

# Pensioner: 1 = yes, 0 = no, missing = no income available to verify
cred_df$pens_ind <- ifelse(
  cred_df$pens_ind %in% 1, "Yes",
  ifelse(cred_df$pens_ind %in% 0, "No", "No Income to Verify")
)

# Weekend shopper: 1 = yes, 0 = no, missing = not available
cred_df$WKEND_SHOPPER_FLAG_12M <- ifelse(
  cred_df$WKEND_SHOPPER_FLAG_12M %in% 1, "Yes",
  ifelse(
    cred_df$WKEND_SHOPPER_FLAG_12M %in% 0, "No",
    "Not Available/Specified"
  )
)
# Map each raw column name to the descriptive label shown in plots and tables.
# Entries are applied one after another; a raw name that is not present in
# cred_df is simply skipped.
cred_labels <- c(
  Education_level = "Customer's highest education level",
  CUST_AGE = "Customer's age",
  Income = "Customer's Monthly Income in Rands",
  Default_Ind = "Did Customer miss 3 payments on their loans",
  Unsecured_Utilisation = "Unsecured credit limit used by customer",
  DDA_RTN_ITM_L2M_NR = "Number of debit orders returned on customer cheque account due to insufficient funds",
  total_liquidity = "Total amount in Rands in customer's disposal (savings/investments/stokvels etc).",
  NUM_PROP_EVER_OWNED = "Number of properties a customer has ever owned",
  WSTATUS_L90D_OPNRU_AL = "Number of loan repayments a customer missed over the last 90 days",
  NR_TRD_USAL = "Number of usecured short-term loans a customer have across all credit providers",
  NR_TRD_OPNRU_ULAL = "Number of usecured long-term loans a customer have across all credit providers",
  Household_credit_score = "Customer's household credit score",
  RTI = "Customer's total loan repayments to income ratio",
  salary_ind = "Customer's source of income",
  pens_ind = "Is the customer a pensioner",
  TOTAL_CT_12M = "Customer's number of spend transactions over last 12 months",
  TOTAL_SP_12M = "Customer's total spend in Rands over last 12 months",
  CREDIT_CARD_TOT_CT_12M = "Customer's total number of credit card transactions over last 12 months",
  C1M_TOT_COUNT = "Customers change in number of spend transactions over 1 month",
  C1M_TOT_SPEND = "Customers change in total spend in Rands over 1 month",
  WKEND_SHOPPER_FLAG_12M = "Is the customer a weekend shopper"
)
for (old_name in names(cred_labels)) {
  names(cred_df)[names(cred_df) == old_name] <- cred_labels[[old_name]]
}
# Share of defaulted / non-defaulted customers within each education level.
# summarise() drops the innermost grouping level, so `percent` is computed
# within each education level; ungroup() then returns a plain tibble so the
# grouping does not leak into later code.
comdata <- cred_df %>%
  group_by(
    `Customer's highest education level`,
    `Did Customer miss 3 payments on their loans`
  ) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Dodged bar chart with a percentage label per bar.
# The y-axis formatter is namespace-qualified (as the label column above is)
# so the plot does not depend on the scales package being attached.
# NOTE(review): nudge_x shifts every label by the same amount, so the labels
# of both fill groups share one x position; position_dodge(width = 0.9) on
# geom_text() would align each label with its own bar - confirm desired layout.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Customer's highest education level`, -n),
    y = percent,
    fill = `Did Customer miss 3 payments on their loans`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl), vjust = -0.25, nudge_x = 0.2) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Defaulted") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))

# Interactive version of the chart
ggplotly(cplot)
## Variable to use in shiny dashboard
## Comparison filter in shiny dashboard
# Columns 1, 14, 15 and 21, selected by position
cred_df[, c(1, 14, 15, 21)]

# Remove missing values: keep only rows without any NA
cred_nna <- na.omit(cred_df)

# Density of 12-month spend, one filled curve per default outcome.
# geom_density() draws a kernel density estimate, not a customer count, so the
# y axis is labelled accordingly.
dplot <- cred_nna %>%
  ggplot(aes(
    x = `Customer's total spend in Rands over last 12 months`,
    fill = `Did Customer miss 3 payments on their loans`
  )) +
  geom_density(color = "#e9ecef", alpha = 0.9, position = "identity") +
  scale_fill_manual(values = c("green", "red")) +
  labs(fill = "Defaulted", y = "Density") +
  theme_minimal()
ggplotly(dplot)

# All columns except positions 1, 14, 15, 21 and 22
cred_df[, -c(1, 14, 15, 21, 22)]

# Check that no missing values remain, then summarise the complete cases
sapply(cred_nna, function(x) sum(is.na(x)))
summary(cred_nna)
# Violin plot (with embedded box plot and mean line) of the 1-month change in
# spend-transaction count, split and coloured by default outcome
fig <- plot_ly(
  data = cred_nna,
  x = ~`Did Customer miss 3 payments on their loans`,
  y = ~`Customers change in number of spend transactions over 1 month`,
  split = ~`Did Customer miss 3 payments on their loans`,
  type = 'violin',
  box = list(visible = TRUE),
  meanline = list(visible = TRUE),
  colors = c("green", "red"),
  color = ~`Did Customer miss 3 payments on their loans`
) %>%
  layout(
    xaxis = list(title = "Defaulted"),
    yaxis = list(title = "", zeroline = FALSE)
  )
fig
# Scatter plot of customer age against the 1-month change in spend-transaction
# count, coloured by default outcome
rplot <- ggplot(
  cred_nna,
  aes(
    x = `Customer's age`,
    y = `Customers change in number of spend transactions over 1 month`,
    color = `Did Customer miss 3 payments on their loans`
  )
) +
  scale_color_manual(values = c("green", "red")) +
  geom_point(size = 2) +
  labs(color = "Default", y = "", x = "") +
  theme_minimal()

# Interactive version of the scatter plot
ggplotly(rplot)
# Final look at the credit data and its (renamed) columns
cred_df
names(cred_df)

# Chunk setup for the next analysis; the package loader was run three times in
# the recorded session and is kept as recorded
knitr::opts_chunk$set(echo = TRUE)
source("utils/loadpackages.R")
source("utils/loadpackages.R")
source("utils/loadpackages.R")
# Read data
hd_df <- read.csv("Heart_Data.csv")
hd_df

# Exploratory column selections. seq_len() replaces 1:ncol(), which would
# count backwards for a zero-column frame.
hd_df[seq_len(ncol(hd_df))]
hd_df[, seq_len(ncol(hd_df))]
hd_df[-1]

# Removing the first column.
# Negative list-style indexing always returns a data frame, whereas
# hd_df[, 2:ncol(hd_df)] collapses to a vector when only one column remains
# and selects columns 2 and 1 when the frame has a single column.
hd_df <- hd_df[-1]
hd_df
# Renaming columns: map each raw column name to its descriptive label.
# Entries are applied one after another; a raw name that is not present in
# hd_df is simply skipped.
hd_labels <- c(
  age = "Age",
  sex = "Sex",
  cp = "Chest pain type",
  trestbps = "Resting blood pressure (in mm Hg on admission to the hospital)",
  chol = "Serum cholestoral in mg/dl",
  fbs = "Fasting blood sugar > 120 mg/dl",
  restecg = "Resting electrocardiography results",
  thalach = "Maximum heart rate achieved",
  exang = "Exercise induced angina",
  oldpeak = "ST depression induced by exercise relative to rest",
  slope = "The slope of the peak exercise ST segment",
  ca = "Number of major vessels colored by fluoroscopic",
  thal = "Status of the heart",
  target = "Heart Disease"
)
for (old_name in names(hd_labels)) {
  names(hd_df)[names(hd_df) == old_name] <- hd_labels[[old_name]]
}

# Inspect the renamed data
hd_df
names(hd_df)
# Share of heart-disease outcomes within each chest pain type.
# summarise() drops the innermost grouping level, so `percent` is computed
# within each chest pain type; ungroup() then returns a plain tibble.
comdata <- hd_df %>%
  group_by(`Chest pain type`, `Heart Disease`) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Dodged bar chart with percentage labels (labels nudged right by 0.2).
# The legend title is "Heart Disease": the fill is the heart-disease outcome,
# and "Defaulted" was a leftover from the credit-default chart.
# The y-axis formatter is namespace-qualified so the plot does not depend on
# the scales package being attached.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Chest pain type`, -n),
    y = percent,
    fill = `Heart Disease`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl), vjust = -0.25, nudge_x = 0.2) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Heart Disease") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cplot)
# Share of heart-disease outcomes within each chest pain type (percent is
# computed within each chest pain type, then the tibble is ungrouped).
comdata <- hd_df %>%
  group_by(`Chest pain type`, `Heart Disease`) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Same chart with the labels at their default position (no vjust / nudge).
# The legend title is "Heart Disease" rather than the copied "Defaulted", and
# the y-axis formatter is namespace-qualified so scales need not be attached.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Chest pain type`, -n),
    y = percent,
    fill = `Heart Disease`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl)) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Heart Disease") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cplot)
# Share of heart-disease outcomes within each chest pain type (percent is
# computed within each chest pain type, then the tibble is ungrouped).
comdata <- hd_df %>%
  group_by(`Chest pain type`, `Heart Disease`) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Dodged bar chart, labels at their default position.
# The y-axis formatter is namespace-qualified so scales need not be attached.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Chest pain type`, -n),
    y = percent,
    fill = `Heart Disease`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl)) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Heart Disease") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cplot)
# Share of heart-disease outcomes within each chest pain type (percent is
# computed within each chest pain type, then the tibble is ungrouped).
comdata <- hd_df %>%
  group_by(`Chest pain type`, `Heart Disease`) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Dodged bar chart; labels raised above the bars and nudged right by 0.2.
# The y-axis formatter is namespace-qualified so scales need not be attached.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Chest pain type`, -n),
    y = percent,
    fill = `Heart Disease`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl), vjust = -0.25, nudge_x = 0.2) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Heart Disease") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cplot)
# Share of heart-disease outcomes within each chest pain type (percent is
# computed within each chest pain type, then the tibble is ungrouped).
comdata <- hd_df %>%
  group_by(`Chest pain type`, `Heart Disease`) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Dodged bar chart; labels raised above the bars and nudged right by 0.1.
# The y-axis formatter is namespace-qualified so scales need not be attached.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Chest pain type`, -n),
    y = percent,
    fill = `Heart Disease`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl), vjust = -0.25, nudge_x = 0.1) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Heart Disease") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cplot)
# Share of heart-disease outcomes within each chest pain type (percent is
# computed within each chest pain type, then the tibble is ungrouped).
comdata <- hd_df %>%
  group_by(`Chest pain type`, `Heart Disease`) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Dodged bar chart; labels raised above the bars and nudged right by 0.4.
# The y-axis formatter is namespace-qualified so scales need not be attached.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Chest pain type`, -n),
    y = percent,
    fill = `Heart Disease`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl), vjust = -0.25, nudge_x = 0.4) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Heart Disease") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cplot)
# Share of heart-disease outcomes within each chest pain type (percent is
# computed within each chest pain type, then the tibble is ungrouped).
comdata <- hd_df %>%
  group_by(`Chest pain type`, `Heart Disease`) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Dodged bar chart; labels raised above the bars and nudged right by 0.1.
# The y-axis formatter is namespace-qualified so scales need not be attached.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Chest pain type`, -n),
    y = percent,
    fill = `Heart Disease`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl), vjust = -0.25, nudge_x = 0.1) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Heart Disease") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cplot)
# Share of heart-disease outcomes within each chest pain type (percent is
# computed within each chest pain type, then the tibble is ungrouped).
comdata <- hd_df %>%
  group_by(`Chest pain type`, `Heart Disease`) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Dodged bar chart; labels raised above the bars and nudged left by 0.1.
# The y-axis formatter is namespace-qualified so scales need not be attached.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Chest pain type`, -n),
    y = percent,
    fill = `Heart Disease`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl), vjust = -0.25, nudge_x = -0.1) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Heart Disease") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cplot)
# Share of heart-disease outcomes within each chest pain type (percent is
# computed within each chest pain type, then the tibble is ungrouped).
comdata <- hd_df %>%
  group_by(`Chest pain type`, `Heart Disease`) %>%
  summarise(n = n()) %>%
  mutate(
    percent = n / sum(n),
    lbl = scales::percent(percent)
  ) %>%
  ungroup()

# Dodged bar chart; labels raised above the bars and nudged left by 0.15.
# The y-axis formatter is namespace-qualified so scales need not be attached.
# NOTE(review): nudge_x moves every label by the same amount, so the labels of
# both fill groups share one x position; position_dodge(width = 0.9) on
# geom_text() would align each label with its own bar - confirm desired layout.
cplot <- comdata %>%
  ggplot(aes(
    x = reorder(`Chest pain type`, -n),
    y = percent,
    fill = `Heart Disease`
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = lbl), vjust = -0.25, nudge_x = -0.15) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", title = "", fill = "Heart Disease") +
  scale_fill_manual(values = c("green", "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cplot)
# Violin plot (with embedded box plot and mean line) of age, split and
# coloured by heart-disease outcome
fig <- plot_ly(
  data = hd_df,
  x = ~`Heart Disease`,
  y = ~`Age`,
  split = ~`Heart Disease`,
  type = 'violin',
  box = list(visible = TRUE),
  meanline = list(visible = TRUE),
  colors = c("green", "red"),
  color = ~`Heart Disease`
) %>%
  layout(
    xaxis = list(title = "Heart Disease"),
    yaxis = list(title = "", zeroline = FALSE)
  )
fig
# Scatter plot of age against serum cholesterol, coloured by heart-disease
# outcome
rplot <- ggplot(
  hd_df,
  aes(
    x = `Age`,
    y = `Serum cholestoral in mg/dl`,
    color = `Heart Disease`
  )
) +
  scale_color_manual(values = c("green", "red")) +
  geom_point(size = 2) +
  labs(color = "Heart Disease", y = "", x = "") +
  theme_minimal()

# Interactive version of the scatter plot
ggplotly(rplot)
# % of rows whose `Heart Disease` value is "Yes".
# %>% binds tighter than / and *, so the pipeline is evaluated before the
# division; the parentheses only make that order explicit. The result is a
# one-cell data frame whose column is named `n`.
heart_cal <- (hd_df %>%
  filter(`Heart Disease` == "Yes") %>%
  count()) / nrow(hd_df) * 100
heart_cal

# Column names and data, for reference
names(hd_df)
hd_df
library(shinydashboard)