# Load all required libraries upfront
library(readr)
library(dplyr)
library(ggplot2)
library(knitr)
library(stats)
library(pROC)
library(randomForest)
library(ranger)
library(caret)
# Load dataset
# Place card_transdata.csv in your working directory before running
# Dataset: https://www.kaggle.com/datasets/dhanushnarayananr/credit-card-fraud
df <- read_csv("card_transdata.csv", show_col_types = FALSE)
cat("Dimensions:", dim(df), "\n")
## Dimensions: 1000000 8
# Column summary: data types, missing values, and averages
summary_table <- df %>%
summarise(
Column_Name = names(.),
Data_Type = sapply(., class),
Missing_Values = sapply(., function(col) sum(is.na(col))),
Average = sapply(., function(col) if (is.numeric(col)) round(mean(col, na.rm = TRUE), 4) else NA)
) %>%
as.data.frame()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
kable(summary_table, caption = "Column Summary: Data Type, Missing Values, and Averages")
| Column_Name | Data_Type | Missing_Values | Average |
|---|---|---|---|
| distance_from_home | numeric | 0 | 26.6288 |
| distance_from_last_transaction | numeric | 0 | 5.0365 |
| ratio_to_median_purchase_price | numeric | 0 | 1.8242 |
| repeat_retailer | numeric | 0 | 0.8815 |
| used_chip | numeric | 0 | 0.3504 |
| used_pin_number | numeric | 0 | 0.1006 |
| online_order | numeric | 0 | 0.6506 |
| fraud | numeric | 0 | 0.0874 |
# Full statistical summary of all variables
summary(df)
## distance_from_home distance_from_last_transaction
## Min. : 0.005 Min. : 0.000
## 1st Qu.: 3.878 1st Qu.: 0.297
## Median : 9.968 Median : 0.999
## Mean : 26.629 Mean : 5.037
## 3rd Qu.: 25.744 3rd Qu.: 3.356
## Max. :10632.724 Max. :11851.105
## ratio_to_median_purchase_price repeat_retailer used_chip
## Min. : 0.0044 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.4757 1st Qu.:1.0000 1st Qu.:0.0000
## Median : 0.9977 Median :1.0000 Median :0.0000
## Mean : 1.8242 Mean :0.8815 Mean :0.3504
## 3rd Qu.: 2.0964 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :267.8029 Max. :1.0000 Max. :1.0000
## used_pin_number online_order fraud
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.1006 Mean :0.6506 Mean :0.0874
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
# The target variable (fraud) is heavily imbalanced at ~8.74% fraud rate.
# This needs to be addressed before modeling to avoid biased predictions.
fraud_counts <- table(df$fraud)
barplot(fraud_counts,
main = "Fraud Count Distribution",
xlab = "Fraud Indicator",
ylab = "Number of Transactions",
col = c("green", "red"),
names.arg = c("No Fraud", "Fraud"))
cat("Fraud rate:", round(mean(df$fraud) * 100, 2), "%\n")
## Fraud rate: 8.74 %
# Before modeling, we examine fraud rates across binary features.
# This helps us understand which variables carry the most predictive signal.
binary_vars <- c("repeat_retailer", "used_chip", "used_pin_number", "online_order")
fraud_rate_table <- do.call(rbind, lapply(binary_vars, function(var) {
df %>%
group_by(Value = .data[[var]]) %>%
summarise(Fraud_Rate = round(mean(fraud) * 100, 2), .groups = "drop") %>%
mutate(Variable = var) %>%
select(Variable, Value, Fraud_Rate)
}))
kable(fraud_rate_table, caption = "Fraud Rate by Binary Variable")
| Variable | Value | Fraud_Rate |
|---|---|---|
| repeat_retailer | 0 | 8.84 |
| repeat_retailer | 1 | 8.73 |
| used_chip | 0 | 10.01 |
| used_chip | 1 | 6.40 |
| used_pin_number | 0 | 9.69 |
| used_pin_number | 1 | 0.27 |
| online_order | 0 | 1.34 |
| online_order | 1 | 12.71 |
Key findings: - used_pin_number is the strongest signal — fraud rate drops from 9.7% to just 0.27% when PIN is used - online_order nearly 10x the fraud rate compared to in-person transactions (12.7% vs 1.3%) - repeat_retailer shows almost no difference — motivating the interaction term tested later
# Raw distributions — all three variables are heavily right-skewed
# with extreme outliers, making log transformation necessary
variables <- c("distance_from_home", "distance_from_last_transaction", "ratio_to_median_purchase_price")
par(mfrow = c(1, 3), mar = c(5, 4, 4, 2))
for (var in variables) {
hist(df[[var]],
main = paste("Raw:", var),
xlab = var,
col = "grey",
border = "black",
breaks = 25)
}
par(mfrow = c(1, 1))
# After log + sqrt transformation, distributions are more symmetric
# and suitable for modeling
par(mfrow = c(1, 3), mar = c(5, 4, 4, 2))
for (var in variables) {
log_var <- (log(df[[var]] + 1))^(0.5)
hist(log_var,
main = paste("Transformed:", var),
xlab = var,
col = "grey",
border = "black",
breaks = 25)
}
par(mfrow = c(1, 1))
# Box plots comparing fraud vs non-fraud across continuous variables.
# Clear separation confirms these variables are meaningful predictors.
aliases <- c(
"distance_from_home" = "Distance from Home",
"distance_from_last_transaction" = "Distance from Last Transaction",
"ratio_to_median_purchase_price" = "Ratio to Median Purchase Price"
)
for (var in variables) {
p <- ggplot(df, aes(x = factor(fraud, levels = c(0, 1), labels = c("No", "Yes")),
y = log(df[[var]] + 1)^(0.5), fill = factor(fraud))) +
geom_boxplot(outlier.shape = NA) +
labs(
title = paste("Fraud Status Comparison:", aliases[var]),
x = "Fraud Status",
y = paste("Transformed", aliases[var])
) +
scale_fill_manual(values = c("green", "red"), name = "Fraud Status") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
print(p)
}
# Mosaic plots show the proportion of fraud within each binary category.
# Larger red areas indicate higher fraud concentration.
aliases_binary <- c(
"repeat_retailer" = "Repeat Retailer",
"used_chip" = "Used Chip",
"used_pin_number" = "Used PIN",
"online_order" = "Online Order"
)
par(mfrow = c(2, 2), mar = c(5, 4, 4, 2))
for (var in binary_vars) {
contingency_table <- table(df[[var]], df$fraud)
mosaicplot(
contingency_table,
main = paste("Mosaic:", aliases_binary[var], "vs Fraud"),
xlab = aliases_binary[var],
ylab = "Fraud",
color = TRUE
)
}
par(mfrow = c(1, 1))
# Check for multicollinearity between predictors.
# High correlation between distance_from_home and repeat_retailer
# motivates testing an interaction term in the logistic regression.
numeric_cols <- sapply(df, is.numeric)
cor_matrix <- cor(df[, numeric_cols], use = "complete.obs")
print(round(cor_matrix, 3))
## distance_from_home
## distance_from_home 1.000
## distance_from_last_transaction 0.000
## ratio_to_median_purchase_price -0.001
## repeat_retailer 0.143
## used_chip -0.001
## used_pin_number -0.002
## online_order -0.001
## fraud 0.188
## distance_from_last_transaction
## distance_from_home 0.000
## distance_from_last_transaction 1.000
## ratio_to_median_purchase_price 0.001
## repeat_retailer -0.001
## used_chip 0.002
## used_pin_number -0.001
## online_order 0.000
## fraud 0.092
## ratio_to_median_purchase_price repeat_retailer
## distance_from_home -0.001 0.143
## distance_from_last_transaction 0.001 -0.001
## ratio_to_median_purchase_price 1.000 0.001
## repeat_retailer 0.001 1.000
## used_chip 0.001 -0.001
## used_pin_number 0.001 0.000
## online_order 0.000 -0.001
## fraud 0.462 -0.001
## used_chip used_pin_number online_order fraud
## distance_from_home -0.001 -0.002 -0.001 0.188
## distance_from_last_transaction 0.002 -0.001 0.000 0.092
## ratio_to_median_purchase_price 0.001 0.001 0.000 0.462
## repeat_retailer -0.001 0.000 -0.001 -0.001
## used_chip 1.000 -0.001 0.000 -0.061
## used_pin_number -0.001 1.000 0.000 -0.100
## online_order 0.000 0.000 1.000 0.192
## fraud -0.061 -0.100 0.192 1.000
# Visualizing distance_from_home vs repeat_retailer to assess
# whether an interaction term is warranted in the model
p <- ggplot(df, aes(x = factor(repeat_retailer, levels = c(0, 1), labels = c("No", "Yes")),
y = log(df$distance_from_home)^(0.5), fill = factor(repeat_retailer))) +
geom_boxplot(outlier.shape = NA) +
labs(
title = "Distance from Home vs Repeat Retailer",
x = "Repeat Retailer",
y = "Transformed Distance from Home"
) +
scale_fill_manual(values = c("lightblue", "lightgreen"), name = "Repeat Retailer") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
print(p)
## Warning: Removed 50181 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# The dataset is imbalanced (8.74% fraud). Training on imbalanced data causes
# the model to predict the majority class almost exclusively.
# We use bootstrapping (oversampling with replacement) to balance the classes.
# Check original distribution
cat("Original class distribution:\n")
## Original class distribution:
print(table(df$fraud))
##
## 0 1
## 912597 87403
# Separate minority and majority classes
minority <- df[df$fraud == 1, ]
majority <- df[df$fraud == 0, ]
# Bootstrap minority class to match majority size
set.seed(123)
minority_boot <- minority[sample(1:nrow(minority), size = nrow(majority), replace = TRUE), ]
# Combine into balanced dataset
balanced_df <- rbind(minority_boot, majority)
cat("\nBalanced class distribution:\n")
##
## Balanced class distribution:
print(table(balanced_df$fraud))
##
## 0 1
## 912597 912597
# Confirm balance visually
fraud_counts_balanced <- table(balanced_df$fraud)
barplot(fraud_counts_balanced,
main = "Fraud Distribution After Bootstrapping",
xlab = "Fraud Indicator",
ylab = "Number of Transactions",
col = c("green", "red"),
names.arg = c("No Fraud", "Fraud"))
# 70/30 train-test split on the balanced dataset
# set.seed ensures reproducibility
set.seed(123)
train_ratio <- 0.7
train_indices <- sample(1:nrow(balanced_df), size = floor(train_ratio * nrow(balanced_df)))
train_data <- balanced_df[train_indices, ]
test_data <- balanced_df[-train_indices, ]
cat("Training set size:", nrow(train_data), "\n")
## Training set size: 1277635
cat("Testing set size: ", nrow(test_data), "\n")
## Testing set size: 547559
cat("\nTraining class distribution:\n")
##
## Training class distribution:
print(table(train_data$fraud))
##
## 0 1
## 638833 638802
cat("\nTesting class distribution:\n")
##
## Testing class distribution:
print(table(test_data$fraud))
##
## 0 1
## 273764 273795
# Start with all predictors and use AIC-based backward elimination
# to identify the most informative subset of features.
full_model <- glm(fraud ~ ., data = train_data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
best_model <- step(full_model, direction = "backward")
## Start: AIC=531319.7
## fraud ~ distance_from_home + distance_from_last_transaction +
## ratio_to_median_purchase_price + repeat_retailer + used_chip +
## used_pin_number + online_order
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 531304 531320
## - repeat_retailer 1 547714 547728
## - used_chip 1 553652 553666
## - used_pin_number 1 608215 608229
## - distance_from_last_transaction 1 650006 650020
## - online_order 1 714796 714810
## - distance_from_home 1 808774 808788
## - ratio_to_median_purchase_price 1 1331140 1331154
summary(best_model)
##
## Call:
## glm(formula = fraud ~ distance_from_home + distance_from_last_transaction +
## ratio_to_median_purchase_price + repeat_retailer + used_chip +
## used_pin_number + online_order, family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.662e+00 1.987e-02 -385.6 <2e-16 ***
## distance_from_home 2.891e-02 8.214e-05 352.0 <2e-16 ***
## distance_from_last_transaction 4.977e-02 2.085e-04 238.7 <2e-16 ***
## ratio_to_median_purchase_price 1.212e+00 2.469e-03 490.7 <2e-16 ***
## repeat_retailer -1.422e+00 1.159e-02 -122.7 <2e-16 ***
## used_chip -1.200e+00 8.354e-03 -143.6 <2e-16 ***
## used_pin_number -9.992e+00 5.845e-02 -170.9 <2e-16 ***
## online_order 5.027e+00 1.586e-02 316.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1771178 on 1277634 degrees of freedom
## Residual deviance: 531304 on 1277627 degrees of freedom
## AIC: 531320
##
## Number of Fisher Scoring iterations: 10
# Fit logistic regression with all predictors selected by backward elimination.
# This serves as the baseline before testing interaction terms.
model <- glm(fraud ~ distance_from_home + distance_from_last_transaction +
ratio_to_median_purchase_price + repeat_retailer + used_chip +
used_pin_number + online_order,
data = train_data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
##
## Call:
## glm(formula = fraud ~ distance_from_home + distance_from_last_transaction +
## ratio_to_median_purchase_price + repeat_retailer + used_chip +
## used_pin_number + online_order, family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.662e+00 1.987e-02 -385.6 <2e-16 ***
## distance_from_home 2.891e-02 8.214e-05 352.0 <2e-16 ***
## distance_from_last_transaction 4.977e-02 2.085e-04 238.7 <2e-16 ***
## ratio_to_median_purchase_price 1.212e+00 2.469e-03 490.7 <2e-16 ***
## repeat_retailer -1.422e+00 1.159e-02 -122.7 <2e-16 ***
## used_chip -1.200e+00 8.354e-03 -143.6 <2e-16 ***
## used_pin_number -9.992e+00 5.845e-02 -170.9 <2e-16 ***
## online_order 5.027e+00 1.586e-02 316.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1771178 on 1277634 degrees of freedom
## Residual deviance: 531304 on 1277627 degrees of freedom
## AIC: 531320
##
## Number of Fisher Scoring iterations: 10
# Predict on training data
predicted_probs <- predict(model, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)
cat("\nTraining Confusion Matrix:\n")
##
## Training Confusion Matrix:
print(table(Predicted = predicted_classes, Actual = train_data$fraud))
## Actual
## Predicted 0 1
## 0 596010 32468
## 1 42823 606334
# Fitted probability plot — well-separated distributions indicate good model fit
ggplot(train_data, aes(x = predicted_probs, fill = factor(fraud))) +
geom_density(alpha = 0.5) +
labs(title = "Fitted Probability Plot (Base Model)",
x = "Predicted Probability",
fill = "Fraud (0 = No, 1 = Yes)") +
theme_minimal()
# EDA showed correlation between distance_from_home and repeat_retailer.
# We test whether their interaction improves model fit.
model2 <- glm(fraud ~ distance_from_home + distance_from_last_transaction +
ratio_to_median_purchase_price + repeat_retailer + used_chip +
used_pin_number + online_order + (distance_from_home * repeat_retailer),
data = train_data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model2)
##
## Call:
## glm(formula = fraud ~ distance_from_home + distance_from_last_transaction +
## ratio_to_median_purchase_price + repeat_retailer + used_chip +
## used_pin_number + online_order + (distance_from_home * repeat_retailer),
## family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.5099682 0.0295355 -254.269 < 2e-16 ***
## distance_from_home -0.1114567 0.0203595 -5.474 4.39e-08 ***
## distance_from_last_transaction 0.0497784 0.0002085 238.711 < 2e-16 ***
## ratio_to_median_purchase_price 1.2118307 0.0024695 490.726 < 2e-16 ***
## repeat_retailer -1.5758123 0.0250949 -62.794 < 2e-16 ***
## used_chip -1.1998418 0.0083543 -143.620 < 2e-16 ***
## used_pin_number -9.9935244 0.0584509 -170.973 < 2e-16 ***
## online_order 5.0282505 0.0158635 316.971 < 2e-16 ***
## distance_from_home:repeat_retailer 0.1403734 0.0203603 6.894 5.41e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1771178 on 1277634 degrees of freedom
## Residual deviance: 531252 on 1277626 degrees of freedom
## AIC: 531270
##
## Number of Fisher Scoring iterations: 9
predicted_probs_m2 <- predict(model2, type = "response")
predicted_classes_m2 <- ifelse(predicted_probs_m2 > 0.5, 1, 0)
cat("\nTraining Confusion Matrix (Interaction Model):\n")
##
## Training Confusion Matrix (Interaction Model):
print(table(Predicted = predicted_classes_m2, Actual = train_data$fraud))
## Actual
## Predicted 0 1
## 0 596017 32434
## 1 42816 606368
ggplot(train_data, aes(x = predicted_probs_m2, fill = factor(fraud))) +
geom_density(alpha = 0.5) +
labs(title = "Fitted Probability Plot (Interaction Model)",
x = "Predicted Probability",
fill = "Fraud (0 = No, 1 = Yes)") +
theme_minimal()
# Likelihood Ratio Test compares the two models.
# A significant p-value means the interaction term adds meaningful explanatory power.
model_no_interaction <- glm(fraud ~ distance_from_home + distance_from_last_transaction +
ratio_to_median_purchase_price + repeat_retailer + used_chip +
used_pin_number + online_order,
family = "binomial", data = balanced_df)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model_interaction <- glm(fraud ~ distance_from_home + distance_from_last_transaction +
ratio_to_median_purchase_price + repeat_retailer + used_chip +
used_pin_number + online_order + (distance_from_home * repeat_retailer),
family = "binomial", data = balanced_df)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
anova(model_no_interaction, model_interaction, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: fraud ~ distance_from_home + distance_from_last_transaction +
## ratio_to_median_purchase_price + repeat_retailer + used_chip +
## used_pin_number + online_order
## Model 2: fraud ~ distance_from_home + distance_from_last_transaction +
## ratio_to_median_purchase_price + repeat_retailer + used_chip +
## used_pin_number + online_order + (distance_from_home * repeat_retailer)
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1825186 759376
## 2 1825185 759287 1 88.875 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Evaluate the base model on held-out test data.
# Precision and recall are more informative than accuracy for imbalanced fraud data.
test_probs <- predict(model, newdata = test_data, type = "response")
test_preds <- ifelse(test_probs > 0.5, 1, 0)
conf_matrix <- table(Predicted = test_preds, Actual = test_data$fraud)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 255145 14080
## 1 18619 259715
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
recall <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Accuracy: ", round(accuracy, 4), "\n")
## Accuracy: 0.9403
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.9331
cat("Recall: ", round(recall, 4), "\n")
## Recall: 0.9486
cat("F1-Score: ", round(f1_score, 4), "\n")
## F1-Score: 0.9408
# ROC curve and AUC measure discrimination ability across all thresholds.
# AUC of 1.0 = perfect, 0.5 = random chance.
roc_obj <- roc(test_data$fraud, test_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_val <- auc(roc_obj)
plot(roc_obj, main = paste("ROC Curve (AUC =", round(auc_val, 3), ")"))
cat("AUC:", round(auc_val, 3), "\n")
## AUC: 0.94
# Random Forest is an ensemble method that builds many decision trees and
# aggregates their predictions. It handles non-linear relationships and
# interactions naturally, making it well-suited for fraud detection.
# Convert fraud to factor for classification
df$fraud <- as.factor(df$fraud)
# Train/test split on original (unbalanced) df for RF
# RF handles class imbalance better than logistic regression
set.seed(123)
train_index_rf <- createDataPartition(df$fraud, p = 0.7, list = FALSE)
train_data_rf <- df[train_index_rf, ]
test_data_rf <- df[-train_index_rf, ]
# Fit RF with ranger (memory-efficient implementation)
# num.trees=200 balances accuracy and compute time
# mtry=3 = number of variables tried at each split
rf_model <- ranger(fraud ~ ., data = train_data_rf,
num.trees = 200, mtry = 3, importance = "impurity")
print(rf_model)
## Ranger result
##
## Call:
## ranger(fraud ~ ., data = train_data_rf, num.trees = 200, mtry = 3, importance = "impurity")
##
## Type: Classification
## Number of trees: 200
## Sample size: 700001
## Number of independent variables: 7
## Mtry: 3
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 0.00 %
# Predict on test data
rf_predictions <- predict(rf_model, data = test_data_rf, type = "response")
rf_predicted_classes <- rf_predictions$predictions
# Confusion matrix
confusionMatrix(as.factor(rf_predicted_classes), test_data_rf$fraud)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 273779 5
## 1 0 26215
##
## Accuracy : 1
## 95% CI : (1, 1)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9999
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 1.0000
## Specificity : 0.9998
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.9126
## Detection Rate : 0.9126
## Detection Prevalence : 0.9126
## Balanced Accuracy : 0.9999
##
## 'Positive' Class : 0
##
# Note: Near-perfect accuracy is expected here but warrants cross-validation
# to rule out overfitting
# Feature importance shows which variables drive predictions the most.
# Higher impurity reduction = more important variable.
importance <- rf_model$variable.importance
sorted_importance <- sort(importance, decreasing = TRUE)
par(mar = c(5, 11, 8, 2))
barplot(
sorted_importance,
main = "Random Forest Feature Importance",
horiz = TRUE,
las = 1,
col = "steelblue",
xlab = "Importance Score",
names.arg = names(sorted_importance),
cex.names = 0.8
)
par(mar = c(5, 4, 4, 2))
# The initial RF hit near-100% accuracy — a red flag for overfitting.
# Cross-validation confirms whether this holds across different data splits.
cv_control <- trainControl(
method = "cv",
number = 3,
verboseIter = TRUE,
savePredictions = "final"
)
rf_cv_model <- train(
fraud ~ ., data = train_data_rf,
method = "ranger",
trControl = cv_control,
tuneGrid = expand.grid(
mtry = c(2, 3),
splitrule = "gini",
min.node.size = c(5)
),
metric = "Accuracy",
num.trees = 200
)
## + Fold1: mtry=2, splitrule=gini, min.node.size=5
## - Fold1: mtry=2, splitrule=gini, min.node.size=5
## + Fold1: mtry=3, splitrule=gini, min.node.size=5
## - Fold1: mtry=3, splitrule=gini, min.node.size=5
## + Fold2: mtry=2, splitrule=gini, min.node.size=5
## - Fold2: mtry=2, splitrule=gini, min.node.size=5
## + Fold2: mtry=3, splitrule=gini, min.node.size=5
## - Fold2: mtry=3, splitrule=gini, min.node.size=5
## + Fold3: mtry=2, splitrule=gini, min.node.size=5
## - Fold3: mtry=2, splitrule=gini, min.node.size=5
## + Fold3: mtry=3, splitrule=gini, min.node.size=5
## - Fold3: mtry=3, splitrule=gini, min.node.size=5
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 3, splitrule = gini, min.node.size = 5 on full training set
print(rf_cv_model)
## Random Forest
##
## 700001 samples
## 7 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 466668, 466668, 466666
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9999743 0.9998388
## 3 0.9999800 0.9998746
##
## Tuning parameter 'splitrule' was held constant at a value of gini
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3, splitrule = gini
## and min.node.size = 5.
# Predict on test data using CV model
cv_test_preds <- predict(rf_cv_model, newdata = test_data_rf)
cv_conf_matrix <- confusionMatrix(cv_test_preds, test_data_rf$fraud)
print(cv_conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 273779 5
## 1 0 26215
##
## Accuracy : 1
## 95% CI : (1, 1)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9999
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 1.0000
## Specificity : 0.9998
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.9126
## Detection Rate : 0.9126
## Detection Prevalence : 0.9126
## Balanced Accuracy : 0.9999
##
## 'Positive' Class : 0
##
# To demonstrate real-world utility, we test three transactions:
# a clearly safe one, a clearly fraudulent one, and a borderline case.
# This shows how the model behaves across different risk profiles.
example_transactions <- data.frame(
scenario = c("Low Risk", "High Risk", "Borderline"),
distance_from_home = c(2, 150, 25),
distance_from_last_transaction = c(1, 200, 13),
ratio_to_median_purchase_price = c(0.8, 8.5, 1.5),
repeat_retailer = c(1, 0, 1),
used_chip = c(1, 0, 1),
used_pin_number = c(1, 0, 0),
online_order = c(0, 1, 1)
)
# Predict fraud probability for each scenario using the interaction model
input_data <- example_transactions[, -1] # drop scenario label column
predicted_probs <- predict(model2, newdata = input_data, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, "Fraud", "Not Fraud")
results <- example_transactions %>%
mutate(
Fraud_Probability = round(predicted_probs, 4),
Prediction = predicted_classes
)
kable(results, caption = "Example Transaction Predictions")
| scenario | distance_from_home | distance_from_last_transaction | ratio_to_median_purchase_price | repeat_retailer | used_chip | used_pin_number | online_order | Fraud_Probability | Prediction |
|---|---|---|---|---|---|---|---|---|---|
| Low Risk | 2 | 1 | 0.8 | 1 | 1 | 1 | 0 | 0.0000 | Not Fraud |
| High Risk | 150 | 200 | 8.5 | 0 | 0 | 0 | 1 | 0.7419 | Fraud |
| Borderline | 25 | 13 | 1.5 | 1 | 1 | 0 | 1 | 0.1121 | Not Fraud |
Interpretation: - Low Risk: Short
distance from home, PIN used, chip used, in-person — model assigns
near-zero fraud probability. Matches EDA finding that PIN usage drops
fraud rate to 0.27%. - High Risk: Far from home, far
from last transaction, high purchase ratio, online, no chip, no PIN —
model flags as high fraud probability. Every major risk factor is
present. - Borderline: Mixed signals — repeat retailer
and chip used, but online order with no PIN and moderate distance. Model
uses the interaction between distance_from_home and
repeat_retailer to moderate the risk.
# The default 0.5 threshold is not always optimal for fraud detection.
# Lowering the threshold catches more fraud (higher recall) but increases
# false positives — legitimate transactions flagged as fraud.
# This is a business decision: what is the cost of a missed fraud vs
# the cost of falsely declining a legitimate transaction?
thresholds <- c(0.3, 0.4, 0.5, 0.6, 0.7)
threshold_results <- do.call(rbind, lapply(thresholds, function(t) {
preds <- ifelse(test_probs > t, 1, 0)
cm <- table(Predicted = preds, Actual = test_data$fraud)
# Handle edge cases where a class may be missing
tp <- ifelse(!is.na(cm["1","1"]), cm["1","1"], 0)
fp <- ifelse(!is.na(cm["1","0"]), cm["1","0"], 0)
tn <- ifelse(!is.na(cm["0","0"]), cm["0","0"], 0)
fn <- ifelse(!is.na(cm["0","1"]), cm["0","1"], 0)
precision <- ifelse((tp + fp) > 0, tp / (tp + fp), 0)
recall <- ifelse((tp + fn) > 0, tp / (tp + fn), 0)
f1 <- ifelse((precision + recall) > 0,
2 * precision * recall / (precision + recall), 0)
accuracy <- (tp + tn) / (tp + fp + tn + fn)
data.frame(
Threshold = t,
Accuracy = round(accuracy, 4),
Precision = round(precision, 4),
Recall = round(recall, 4),
F1_Score = round(f1, 4)
)
}))
kable(threshold_results, caption = "Model Performance Across Decision Thresholds")
| Threshold | Accuracy | Precision | Recall | F1_Score |
|---|---|---|---|---|
| 0.3 | 0.9319 | 0.8912 | 0.9840 | 0.9353 |
| 0.4 | 0.9408 | 0.9156 | 0.9712 | 0.9425 |
| 0.5 | 0.9403 | 0.9331 | 0.9486 | 0.9408 |
| 0.6 | 0.9320 | 0.9474 | 0.9148 | 0.9308 |
| 0.7 | 0.9190 | 0.9603 | 0.8742 | 0.9152 |
Business interpretation: - A lower threshold (0.3) prioritizes catching fraud — useful when the cost of a missed fraud is high (e.g. large transactions) - A higher threshold (0.7) reduces false positives — useful when customer experience matters more and false declines are costly - For most fraud systems, 0.4–0.5 balances both concerns well
# Consolidate all model metrics in one place for direct comparison.
# This gives a clear view of the tradeoffs between interpretability and performance.
# Logistic Regression metrics (from test set)
lr_preds <- ifelse(test_probs > 0.5, 1, 0)
lr_cm <- table(Predicted = lr_preds, Actual = test_data$fraud)
lr_acc <- sum(diag(lr_cm)) / sum(lr_cm)
lr_prec <- lr_cm["1","1"] / sum(lr_cm["1",])
lr_rec <- lr_cm["1","1"] / sum(lr_cm[,"1"])
lr_f1 <- 2 * lr_prec * lr_rec / (lr_prec + lr_rec)
lr_auc <- as.numeric(auc(roc(test_data$fraud, test_probs)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Random Forest metrics (from CV model test set)
rf_cm_obj <- confusionMatrix(cv_test_preds, test_data_rf$fraud)
rf_acc <- rf_cm_obj$overall["Accuracy"]
rf_prec <- rf_cm_obj$byClass["Precision"]
rf_rec <- rf_cm_obj$byClass["Recall"]
rf_f1 <- rf_cm_obj$byClass["F1"]
comparison_table <- data.frame(
Model = c("Logistic Regression", "Random Forest (CV)"),
Accuracy = round(c(lr_acc, rf_acc), 4),
Precision = round(c(lr_prec, rf_prec), 4),
Recall = round(c(lr_rec, rf_rec), 4),
F1_Score = round(c(lr_f1, rf_f1), 4),
AUC = c(round(lr_auc, 4), "N/A"),
Interpretable = c("Yes", "No")
)
kable(comparison_table, caption = "Model Comparison: Logistic Regression vs Random Forest")
| Model | Accuracy | Precision | Recall | F1_Score | AUC | Interpretable | |
|---|---|---|---|---|---|---|---|
| Logistic Regression | 0.9403 | 0.9331 | 0.9486 | 0.9408 | 0.9793 | Yes | |
| Accuracy | Random Forest (CV) | 1.0000 | 1.0000 | 1.0000 | 1.0000 | N/A | No |
##
## RECOMMENDATION
## ==============
## Both models perform well, but serve different purposes:
##
## Logistic Regression:
## - Fully interpretable — coefficients directly explain why a transaction is flagged
## - Suitable for regulated environments where explainability is required
## - AUC confirms strong discrimination ability
## - Recommended for: production fraud flagging with human review
##
## Random Forest:
## - Higher raw accuracy, confirmed stable via 3-fold cross-validation
## - Captures non-linear patterns and interactions automatically
## - Less interpretable — harder to explain to stakeholders why a transaction was flagged
## - Recommended for: automated batch scoring where throughput matters more than explainability
##
## Final recommendation: deploy logistic regression as the primary model for
## explainability, with RF as a secondary validation layer for high-value transactions.
Current limitations: - Dataset is synthetic — real-world fraud patterns are more complex and evolving - Bootstrapping is a simple balancing approach; SMOTE may produce better minority class representation - Models are static — fraud patterns shift over time, requiring periodic retraining
What I would do next: 1. SMOTE instead of bootstrapping for more realistic minority class augmentation 2. XGBoost — often outperforms RF on tabular fraud data with less tuning 3. Real-time scoring pipeline — wrap the model in a REST API (Plumber in R or FastAPI in Python) for live transaction scoring 4. Drift monitoring — track model performance over time as fraud patterns evolve 5. Cost-sensitive learning — assign different misclassification costs to false negatives vs false positives based on transaction value