1. Setup & Data Loading

# Load all required libraries upfront
library(readr)
library(dplyr)
library(ggplot2)
library(knitr)
library(stats)
library(pROC)
library(randomForest)
library(ranger)
library(caret)

# Load dataset
# Place card_transdata.csv in your working directory before running
# Dataset: https://www.kaggle.com/datasets/dhanushnarayananr/credit-card-fraud
df <- read_csv("card_transdata.csv", show_col_types = FALSE)

cat("Dimensions:", dim(df), "\n")
## Dimensions: 1000000 8

2. Exploratory Data Analysis

2.1 Dataset Overview

# Column summary: data types, missing values, and averages
summary_table <- df %>%
  summarise(
    Column_Name = names(.),
    Data_Type = sapply(., class),
    Missing_Values = sapply(., function(col) sum(is.na(col))),
    Average = sapply(., function(col) if (is.numeric(col)) round(mean(col, na.rm = TRUE), 4) else NA)
  ) %>%
  as.data.frame()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
kable(summary_table, caption = "Column Summary: Data Type, Missing Values, and Averages")
Column Summary: Data Type, Missing Values, and Averages
Column_Name Data_Type Missing_Values Average
distance_from_home numeric 0 26.6288
distance_from_last_transaction numeric 0 5.0365
ratio_to_median_purchase_price numeric 0 1.8242
repeat_retailer numeric 0 0.8815
used_chip numeric 0 0.3504
used_pin_number numeric 0 0.1006
online_order numeric 0 0.6506
fraud numeric 0 0.0874
# Full statistical summary of all variables
summary(df)
##  distance_from_home  distance_from_last_transaction
##  Min.   :    0.005   Min.   :    0.000             
##  1st Qu.:    3.878   1st Qu.:    0.297             
##  Median :    9.968   Median :    0.999             
##  Mean   :   26.629   Mean   :    5.037             
##  3rd Qu.:   25.744   3rd Qu.:    3.356             
##  Max.   :10632.724   Max.   :11851.105             
##  ratio_to_median_purchase_price repeat_retailer    used_chip     
##  Min.   :  0.0044               Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:  0.4757               1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :  0.9977               Median :1.0000   Median :0.0000  
##  Mean   :  1.8242               Mean   :0.8815   Mean   :0.3504  
##  3rd Qu.:  2.0964               3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :267.8029               Max.   :1.0000   Max.   :1.0000  
##  used_pin_number   online_order        fraud       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.1006   Mean   :0.6506   Mean   :0.0874  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000

2.2 Class Distribution

# The target variable (fraud) is heavily imbalanced at ~8.74% fraud rate.
# This needs to be addressed before modeling to avoid biased predictions.
fraud_counts <- table(df$fraud)

barplot(fraud_counts,
        main = "Fraud Count Distribution",
        xlab = "Fraud Indicator",
        ylab = "Number of Transactions",
        col = c("green", "red"),
        names.arg = c("No Fraud", "Fraud"))

cat("Fraud rate:", round(mean(df$fraud) * 100, 2), "%\n")
## Fraud rate: 8.74 %

2.3 Fraud Rate by Binary Variables

# Before modeling, we examine fraud rates across binary features.
# This helps us understand which variables carry the most predictive signal.
binary_vars <- c("repeat_retailer", "used_chip", "used_pin_number", "online_order")

fraud_rate_table <- do.call(rbind, lapply(binary_vars, function(var) {
  df %>%
    group_by(Value = .data[[var]]) %>%
    summarise(Fraud_Rate = round(mean(fraud) * 100, 2), .groups = "drop") %>%
    mutate(Variable = var) %>%
    select(Variable, Value, Fraud_Rate)
}))

kable(fraud_rate_table, caption = "Fraud Rate by Binary Variable")
Fraud Rate by Binary Variable
Variable Value Fraud_Rate
repeat_retailer 0 8.84
repeat_retailer 1 8.73
used_chip 0 10.01
used_chip 1 6.40
used_pin_number 0 9.69
used_pin_number 1 0.27
online_order 0 1.34
online_order 1 12.71

Key findings: - used_pin_number is the strongest signal — fraud rate drops from 9.7% to just 0.27% when PIN is used - online_order nearly 10x the fraud rate compared to in-person transactions (12.7% vs 1.3%) - repeat_retailer shows almost no difference — motivating the interaction term tested later

2.4 Continuous Variable Distributions

# Raw distributions — all three variables are heavily right-skewed
# with extreme outliers, making log transformation necessary
variables <- c("distance_from_home", "distance_from_last_transaction", "ratio_to_median_purchase_price")

par(mfrow = c(1, 3), mar = c(5, 4, 4, 2))
for (var in variables) {
  hist(df[[var]],
       main = paste("Raw:", var),
       xlab = var,
       col = "grey",
       border = "black",
       breaks = 25)
}

par(mfrow = c(1, 1))
# After log + sqrt transformation, distributions are more symmetric
# and suitable for modeling
par(mfrow = c(1, 3), mar = c(5, 4, 4, 2))
for (var in variables) {
  log_var <- (log(df[[var]] + 1))^(0.5)
  hist(log_var,
       main = paste("Transformed:", var),
       xlab = var,
       col = "grey",
       border = "black",
       breaks = 25)
}

par(mfrow = c(1, 1))

2.5 Fraud vs Non-Fraud: Continuous Variables

# Box plots comparing fraud vs non-fraud across continuous variables.
# Clear separation confirms these variables are meaningful predictors.
aliases <- c(
  "distance_from_home" = "Distance from Home",
  "distance_from_last_transaction" = "Distance from Last Transaction",
  "ratio_to_median_purchase_price" = "Ratio to Median Purchase Price"
)

for (var in variables) {
  p <- ggplot(df, aes(x = factor(fraud, levels = c(0, 1), labels = c("No", "Yes")),
                      y = log(df[[var]] + 1)^(0.5), fill = factor(fraud))) +
    geom_boxplot(outlier.shape = NA) +
    labs(
      title = paste("Fraud Status Comparison:", aliases[var]),
      x = "Fraud Status",
      y = paste("Transformed", aliases[var])
    ) +
    scale_fill_manual(values = c("green", "red"), name = "Fraud Status") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
  print(p)
}

2.6 Binary Variables vs Fraud

# Mosaic plots show the proportion of fraud within each binary category.
# Larger red areas indicate higher fraud concentration.
aliases_binary <- c(
  "repeat_retailer" = "Repeat Retailer",
  "used_chip" = "Used Chip",
  "used_pin_number" = "Used PIN",
  "online_order" = "Online Order"
)

par(mfrow = c(2, 2), mar = c(5, 4, 4, 2))
for (var in binary_vars) {
  contingency_table <- table(df[[var]], df$fraud)
  mosaicplot(
    contingency_table,
    main = paste("Mosaic:", aliases_binary[var], "vs Fraud"),
    xlab = aliases_binary[var],
    ylab = "Fraud",
    color = TRUE
  )
}

par(mfrow = c(1, 1))

2.7 Correlation Matrix

# Check for multicollinearity between predictors.
# High correlation between distance_from_home and repeat_retailer
# motivates testing an interaction term in the logistic regression.
numeric_cols <- sapply(df, is.numeric)
cor_matrix <- cor(df[, numeric_cols], use = "complete.obs")
print(round(cor_matrix, 3))
##                                distance_from_home
## distance_from_home                          1.000
## distance_from_last_transaction              0.000
## ratio_to_median_purchase_price             -0.001
## repeat_retailer                             0.143
## used_chip                                  -0.001
## used_pin_number                            -0.002
## online_order                               -0.001
## fraud                                       0.188
##                                distance_from_last_transaction
## distance_from_home                                      0.000
## distance_from_last_transaction                          1.000
## ratio_to_median_purchase_price                          0.001
## repeat_retailer                                        -0.001
## used_chip                                               0.002
## used_pin_number                                        -0.001
## online_order                                            0.000
## fraud                                                   0.092
##                                ratio_to_median_purchase_price repeat_retailer
## distance_from_home                                     -0.001           0.143
## distance_from_last_transaction                          0.001          -0.001
## ratio_to_median_purchase_price                          1.000           0.001
## repeat_retailer                                         0.001           1.000
## used_chip                                               0.001          -0.001
## used_pin_number                                         0.001           0.000
## online_order                                            0.000          -0.001
## fraud                                                   0.462          -0.001
##                                used_chip used_pin_number online_order  fraud
## distance_from_home                -0.001          -0.002       -0.001  0.188
## distance_from_last_transaction     0.002          -0.001        0.000  0.092
## ratio_to_median_purchase_price     0.001           0.001        0.000  0.462
## repeat_retailer                   -0.001           0.000       -0.001 -0.001
## used_chip                          1.000          -0.001        0.000 -0.061
## used_pin_number                   -0.001           1.000        0.000 -0.100
## online_order                       0.000           0.000        1.000  0.192
## fraud                             -0.061          -0.100        0.192  1.000
# Visualizing distance_from_home vs repeat_retailer to assess
# whether an interaction term is warranted in the model
p <- ggplot(df, aes(x = factor(repeat_retailer, levels = c(0, 1), labels = c("No", "Yes")),
                    y = log(df$distance_from_home)^(0.5), fill = factor(repeat_retailer))) +
  geom_boxplot(outlier.shape = NA) +
  labs(
    title = "Distance from Home vs Repeat Retailer",
    x = "Repeat Retailer",
    y = "Transformed Distance from Home"
  ) +
  scale_fill_manual(values = c("lightblue", "lightgreen"), name = "Repeat Retailer") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
print(p)
## Warning: Removed 50181 rows containing non-finite outside the scale range
## (`stat_boxplot()`).


3. Data Cleaning & Balancing

# The dataset is imbalanced (8.74% fraud). Training on imbalanced data causes
# the model to predict the majority class almost exclusively.
# We use bootstrapping (oversampling with replacement) to balance the classes.

# Check original distribution
cat("Original class distribution:\n")
## Original class distribution:
print(table(df$fraud))
## 
##      0      1 
## 912597  87403
# Separate minority and majority classes
minority <- df[df$fraud == 1, ]
majority <- df[df$fraud == 0, ]

# Bootstrap minority class to match majority size
set.seed(123)
minority_boot <- minority[sample(1:nrow(minority), size = nrow(majority), replace = TRUE), ]

# Combine into balanced dataset
balanced_df <- rbind(minority_boot, majority)

cat("\nBalanced class distribution:\n")
## 
## Balanced class distribution:
print(table(balanced_df$fraud))
## 
##      0      1 
## 912597 912597
# Confirm balance visually
fraud_counts_balanced <- table(balanced_df$fraud)
barplot(fraud_counts_balanced,
        main = "Fraud Distribution After Bootstrapping",
        xlab = "Fraud Indicator",
        ylab = "Number of Transactions",
        col = c("green", "red"),
        names.arg = c("No Fraud", "Fraud"))


4. Train / Test Split

# 70/30 train-test split on the balanced dataset
# set.seed ensures reproducibility
set.seed(123)
train_ratio <- 0.7
train_indices <- sample(1:nrow(balanced_df), size = floor(train_ratio * nrow(balanced_df)))

train_data <- balanced_df[train_indices, ]
test_data  <- balanced_df[-train_indices, ]

cat("Training set size:", nrow(train_data), "\n")
## Training set size: 1277635
cat("Testing set size: ", nrow(test_data), "\n")
## Testing set size:  547559
cat("\nTraining class distribution:\n")
## 
## Training class distribution:
print(table(train_data$fraud))
## 
##      0      1 
## 638833 638802
cat("\nTesting class distribution:\n")
## 
## Testing class distribution:
print(table(test_data$fraud))
## 
##      0      1 
## 273764 273795

5. Logistic Regression

5.1 Feature Selection via Backward Elimination

# Start with all predictors and use AIC-based backward elimination
# to identify the most informative subset of features.
full_model <- glm(fraud ~ ., data = train_data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
best_model <- step(full_model, direction = "backward")
## Start:  AIC=531319.7
## fraud ~ distance_from_home + distance_from_last_transaction + 
##     ratio_to_median_purchase_price + repeat_retailer + used_chip + 
##     used_pin_number + online_order
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                                  Df Deviance     AIC
## <none>                                531304  531320
## - repeat_retailer                 1   547714  547728
## - used_chip                       1   553652  553666
## - used_pin_number                 1   608215  608229
## - distance_from_last_transaction  1   650006  650020
## - online_order                    1   714796  714810
## - distance_from_home              1   808774  808788
## - ratio_to_median_purchase_price  1  1331140 1331154
summary(best_model)
## 
## Call:
## glm(formula = fraud ~ distance_from_home + distance_from_last_transaction + 
##     ratio_to_median_purchase_price + repeat_retailer + used_chip + 
##     used_pin_number + online_order, family = binomial, data = train_data)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -7.662e+00  1.987e-02  -385.6   <2e-16 ***
## distance_from_home              2.891e-02  8.214e-05   352.0   <2e-16 ***
## distance_from_last_transaction  4.977e-02  2.085e-04   238.7   <2e-16 ***
## ratio_to_median_purchase_price  1.212e+00  2.469e-03   490.7   <2e-16 ***
## repeat_retailer                -1.422e+00  1.159e-02  -122.7   <2e-16 ***
## used_chip                      -1.200e+00  8.354e-03  -143.6   <2e-16 ***
## used_pin_number                -9.992e+00  5.845e-02  -170.9   <2e-16 ***
## online_order                    5.027e+00  1.586e-02   316.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1771178  on 1277634  degrees of freedom
## Residual deviance:  531304  on 1277627  degrees of freedom
## AIC: 531320
## 
## Number of Fisher Scoring iterations: 10

5.2 Base Model (No Interaction)

# Fit logistic regression with all predictors selected by backward elimination.
# This serves as the baseline before testing interaction terms.
model <- glm(fraud ~ distance_from_home + distance_from_last_transaction +
               ratio_to_median_purchase_price + repeat_retailer + used_chip +
               used_pin_number + online_order,
             data = train_data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
## 
## Call:
## glm(formula = fraud ~ distance_from_home + distance_from_last_transaction + 
##     ratio_to_median_purchase_price + repeat_retailer + used_chip + 
##     used_pin_number + online_order, family = binomial, data = train_data)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -7.662e+00  1.987e-02  -385.6   <2e-16 ***
## distance_from_home              2.891e-02  8.214e-05   352.0   <2e-16 ***
## distance_from_last_transaction  4.977e-02  2.085e-04   238.7   <2e-16 ***
## ratio_to_median_purchase_price  1.212e+00  2.469e-03   490.7   <2e-16 ***
## repeat_retailer                -1.422e+00  1.159e-02  -122.7   <2e-16 ***
## used_chip                      -1.200e+00  8.354e-03  -143.6   <2e-16 ***
## used_pin_number                -9.992e+00  5.845e-02  -170.9   <2e-16 ***
## online_order                    5.027e+00  1.586e-02   316.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1771178  on 1277634  degrees of freedom
## Residual deviance:  531304  on 1277627  degrees of freedom
## AIC: 531320
## 
## Number of Fisher Scoring iterations: 10
# Predict on training data
predicted_probs <- predict(model, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)

cat("\nTraining Confusion Matrix:\n")
## 
## Training Confusion Matrix:
print(table(Predicted = predicted_classes, Actual = train_data$fraud))
##          Actual
## Predicted      0      1
##         0 596010  32468
##         1  42823 606334
# Fitted probability plot — well-separated distributions indicate good model fit
ggplot(train_data, aes(x = predicted_probs, fill = factor(fraud))) +
  geom_density(alpha = 0.5) +
  labs(title = "Fitted Probability Plot (Base Model)",
       x = "Predicted Probability",
       fill = "Fraud (0 = No, 1 = Yes)") +
  theme_minimal()

5.3 Interaction Model

# EDA showed correlation between distance_from_home and repeat_retailer.
# We test whether their interaction improves model fit.
model2 <- glm(fraud ~ distance_from_home + distance_from_last_transaction +
                ratio_to_median_purchase_price + repeat_retailer + used_chip +
                used_pin_number + online_order + (distance_from_home * repeat_retailer),
              data = train_data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model2)
## 
## Call:
## glm(formula = fraud ~ distance_from_home + distance_from_last_transaction + 
##     ratio_to_median_purchase_price + repeat_retailer + used_chip + 
##     used_pin_number + online_order + (distance_from_home * repeat_retailer), 
##     family = binomial, data = train_data)
## 
## Coefficients:
##                                      Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                        -7.5099682  0.0295355 -254.269  < 2e-16 ***
## distance_from_home                 -0.1114567  0.0203595   -5.474 4.39e-08 ***
## distance_from_last_transaction      0.0497784  0.0002085  238.711  < 2e-16 ***
## ratio_to_median_purchase_price      1.2118307  0.0024695  490.726  < 2e-16 ***
## repeat_retailer                    -1.5758123  0.0250949  -62.794  < 2e-16 ***
## used_chip                          -1.1998418  0.0083543 -143.620  < 2e-16 ***
## used_pin_number                    -9.9935244  0.0584509 -170.973  < 2e-16 ***
## online_order                        5.0282505  0.0158635  316.971  < 2e-16 ***
## distance_from_home:repeat_retailer  0.1403734  0.0203603    6.894 5.41e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1771178  on 1277634  degrees of freedom
## Residual deviance:  531252  on 1277626  degrees of freedom
## AIC: 531270
## 
## Number of Fisher Scoring iterations: 9
predicted_probs_m2 <- predict(model2, type = "response")
predicted_classes_m2 <- ifelse(predicted_probs_m2 > 0.5, 1, 0)

cat("\nTraining Confusion Matrix (Interaction Model):\n")
## 
## Training Confusion Matrix (Interaction Model):
print(table(Predicted = predicted_classes_m2, Actual = train_data$fraud))
##          Actual
## Predicted      0      1
##         0 596017  32434
##         1  42816 606368
ggplot(train_data, aes(x = predicted_probs_m2, fill = factor(fraud))) +
  geom_density(alpha = 0.5) +
  labs(title = "Fitted Probability Plot (Interaction Model)",
       x = "Predicted Probability",
       fill = "Fraud (0 = No, 1 = Yes)") +
  theme_minimal()

5.4 Model Comparison: Likelihood Ratio Test

# Likelihood Ratio Test compares the two models.
# A significant p-value means the interaction term adds meaningful explanatory power.
model_no_interaction <- glm(fraud ~ distance_from_home + distance_from_last_transaction +
                               ratio_to_median_purchase_price + repeat_retailer + used_chip +
                               used_pin_number + online_order,
                             family = "binomial", data = balanced_df)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model_interaction <- glm(fraud ~ distance_from_home + distance_from_last_transaction +
                            ratio_to_median_purchase_price + repeat_retailer + used_chip +
                            used_pin_number + online_order + (distance_from_home * repeat_retailer),
                          family = "binomial", data = balanced_df)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
anova(model_no_interaction, model_interaction, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: fraud ~ distance_from_home + distance_from_last_transaction + 
##     ratio_to_median_purchase_price + repeat_retailer + used_chip + 
##     used_pin_number + online_order
## Model 2: fraud ~ distance_from_home + distance_from_last_transaction + 
##     ratio_to_median_purchase_price + repeat_retailer + used_chip + 
##     used_pin_number + online_order + (distance_from_home * repeat_retailer)
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1   1825186     759376                          
## 2   1825185     759287  1   88.875 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.5 Test Set Evaluation

# Evaluate the base model on held-out test data.
# Precision and recall are more informative than accuracy for imbalanced fraud data.
test_probs <- predict(model, newdata = test_data, type = "response")
test_preds <- ifelse(test_probs > 0.5, 1, 0)

conf_matrix <- table(Predicted = test_preds, Actual = test_data$fraud)
print(conf_matrix)
##          Actual
## Predicted      0      1
##         0 255145  14080
##         1  18619 259715
accuracy  <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
recall    <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
f1_score  <- 2 * (precision * recall) / (precision + recall)

cat("Accuracy: ", round(accuracy, 4), "\n")
## Accuracy:  0.9403
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.9331
cat("Recall:   ", round(recall, 4), "\n")
## Recall:    0.9486
cat("F1-Score: ", round(f1_score, 4), "\n")
## F1-Score:  0.9408
# ROC curve and AUC measure discrimination ability across all thresholds.
# AUC of 1.0 = perfect, 0.5 = random chance.
roc_obj <- roc(test_data$fraud, test_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_val <- auc(roc_obj)
plot(roc_obj, main = paste("ROC Curve (AUC =", round(auc_val, 3), ")"))

cat("AUC:", round(auc_val, 3), "\n")
## AUC: 0.94

6. Random Forest

# Random Forest is an ensemble method that builds many decision trees and
# aggregates their predictions. It handles non-linear relationships and
# interactions naturally, making it well-suited for fraud detection.

# Convert fraud to factor for classification
df$fraud <- as.factor(df$fraud)

# Train/test split on original (unbalanced) df for RF
# RF handles class imbalance better than logistic regression
set.seed(123)
train_index_rf <- createDataPartition(df$fraud, p = 0.7, list = FALSE)
train_data_rf  <- df[train_index_rf, ]
test_data_rf   <- df[-train_index_rf, ]

# Fit RF with ranger (memory-efficient implementation)
# num.trees=200 balances accuracy and compute time
# mtry=3 = number of variables tried at each split
rf_model <- ranger(fraud ~ ., data = train_data_rf,
                   num.trees = 200, mtry = 3, importance = "impurity")

print(rf_model)
## Ranger result
## 
## Call:
##  ranger(fraud ~ ., data = train_data_rf, num.trees = 200, mtry = 3,      importance = "impurity") 
## 
## Type:                             Classification 
## Number of trees:                  200 
## Sample size:                      700001 
## Number of independent variables:  7 
## Mtry:                             3 
## Target node size:                 1 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             0.00 %
# Predict on test data
rf_predictions      <- predict(rf_model, data = test_data_rf, type = "response")
rf_predicted_classes <- rf_predictions$predictions

# Confusion matrix
confusionMatrix(as.factor(rf_predicted_classes), test_data_rf$fraud)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 273779      5
##          1      0  26215
##                                  
##                Accuracy : 1      
##                  95% CI : (1, 1) 
##     No Information Rate : 0.9126 
##     P-Value [Acc > NIR] : < 2e-16
##                                  
##                   Kappa : 0.9999 
##                                  
##  Mcnemar's Test P-Value : 0.07364
##                                  
##             Sensitivity : 1.0000 
##             Specificity : 0.9998 
##          Pos Pred Value : 1.0000 
##          Neg Pred Value : 1.0000 
##              Prevalence : 0.9126 
##          Detection Rate : 0.9126 
##    Detection Prevalence : 0.9126 
##       Balanced Accuracy : 0.9999 
##                                  
##        'Positive' Class : 0      
## 
# Note: Near-perfect accuracy is expected here but warrants cross-validation
# to rule out overfitting
# Feature importance shows which variables drive predictions the most.
# Higher impurity reduction = more important variable.
importance <- rf_model$variable.importance
sorted_importance <- sort(importance, decreasing = TRUE)

par(mar = c(5, 11, 8, 2))
barplot(
  sorted_importance,
  main = "Random Forest Feature Importance",
  horiz = TRUE,
  las = 1,
  col = "steelblue",
  xlab = "Importance Score",
  names.arg = names(sorted_importance),
  cex.names = 0.8
)

par(mar = c(5, 4, 4, 2))

6.1 Cross-Validation

# The initial RF hit near-100% accuracy — a red flag for overfitting.
# Cross-validation confirms whether this holds across different data splits.
cv_control <- trainControl(
  method = "cv",
  number = 3,
  verboseIter = TRUE,
  savePredictions = "final"
)

rf_cv_model <- train(
  fraud ~ ., data = train_data_rf,
  method = "ranger",
  trControl = cv_control,
  tuneGrid = expand.grid(
    mtry = c(2, 3),
    splitrule = "gini",
    min.node.size = c(5)
  ),
  metric = "Accuracy",
  num.trees = 200
)
## + Fold1: mtry=2, splitrule=gini, min.node.size=5 
## - Fold1: mtry=2, splitrule=gini, min.node.size=5 
## + Fold1: mtry=3, splitrule=gini, min.node.size=5 
## - Fold1: mtry=3, splitrule=gini, min.node.size=5 
## + Fold2: mtry=2, splitrule=gini, min.node.size=5 
## - Fold2: mtry=2, splitrule=gini, min.node.size=5 
## + Fold2: mtry=3, splitrule=gini, min.node.size=5 
## - Fold2: mtry=3, splitrule=gini, min.node.size=5 
## + Fold3: mtry=2, splitrule=gini, min.node.size=5 
## - Fold3: mtry=2, splitrule=gini, min.node.size=5 
## + Fold3: mtry=3, splitrule=gini, min.node.size=5 
## - Fold3: mtry=3, splitrule=gini, min.node.size=5 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 3, splitrule = gini, min.node.size = 5 on full training set
print(rf_cv_model)
## Random Forest 
## 
## 700001 samples
##      7 predictor
##      2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 466668, 466668, 466666 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9999743  0.9998388
##   3     0.9999800  0.9998746
## 
## Tuning parameter 'splitrule' was held constant at a value of gini
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3, splitrule = gini
##  and min.node.size = 5.
# Predict on test data using CV model
cv_test_preds <- predict(rf_cv_model, newdata = test_data_rf)
cv_conf_matrix <- confusionMatrix(cv_test_preds, test_data_rf$fraud)
print(cv_conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 273779      5
##          1      0  26215
##                                  
##                Accuracy : 1      
##                  95% CI : (1, 1) 
##     No Information Rate : 0.9126 
##     P-Value [Acc > NIR] : < 2e-16
##                                  
##                   Kappa : 0.9999 
##                                  
##  Mcnemar's Test P-Value : 0.07364
##                                  
##             Sensitivity : 1.0000 
##             Specificity : 0.9998 
##          Pos Pred Value : 1.0000 
##          Neg Pred Value : 1.0000 
##              Prevalence : 0.9126 
##          Detection Rate : 0.9126 
##    Detection Prevalence : 0.9126 
##       Balanced Accuracy : 0.9999 
##                                  
##        'Positive' Class : 0      
## 

7. Example Predictions

# To demonstrate real-world utility, we test three transactions:
# a clearly safe one, a clearly fraudulent one, and a borderline case.
# This shows how the model behaves across different risk profiles.

example_transactions <- data.frame(
  scenario = c("Low Risk", "High Risk", "Borderline"),
  distance_from_home = c(2, 150, 25),
  distance_from_last_transaction = c(1, 200, 13),
  ratio_to_median_purchase_price = c(0.8, 8.5, 1.5),
  repeat_retailer = c(1, 0, 1),
  used_chip = c(1, 0, 1),
  used_pin_number = c(1, 0, 0),
  online_order = c(0, 1, 1)
)

# Predict fraud probability for each scenario using the interaction model
input_data <- example_transactions[, -1]  # drop scenario label column
predicted_probs <- predict(model2, newdata = input_data, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, "Fraud", "Not Fraud")

results <- example_transactions %>%
  mutate(
    Fraud_Probability = round(predicted_probs, 4),
    Prediction = predicted_classes
  )

kable(results, caption = "Example Transaction Predictions")
Example Transaction Predictions
scenario distance_from_home distance_from_last_transaction ratio_to_median_purchase_price repeat_retailer used_chip used_pin_number online_order Fraud_Probability Prediction
Low Risk 2 1 0.8 1 1 1 0 0.0000 Not Fraud
High Risk 150 200 8.5 0 0 0 1 0.7419 Fraud
Borderline 25 13 1.5 1 1 0 1 0.1121 Not Fraud

Interpretation: - Low Risk: Short distance from home, PIN used, chip used, in-person — model assigns near-zero fraud probability. Matches EDA finding that PIN usage drops fraud rate to 0.27%. - High Risk: Far from home, far from last transaction, high purchase ratio, online, no chip, no PIN — model flags as high fraud probability. Every major risk factor is present. - Borderline: Mixed signals — repeat retailer and chip used, but online order with no PIN and moderate distance. Model uses the interaction between distance_from_home and repeat_retailer to moderate the risk.


7.1 Threshold Analysis

# The default 0.5 threshold is not always optimal for fraud detection.
# Lowering the threshold catches more fraud (higher recall) but increases
# false positives — legitimate transactions flagged as fraud.
# This is a business decision: what is the cost of a missed fraud vs
# the cost of falsely declining a legitimate transaction?

thresholds <- c(0.3, 0.4, 0.5, 0.6, 0.7)

threshold_results <- do.call(rbind, lapply(thresholds, function(t) {
  preds <- ifelse(test_probs > t, 1, 0)
  cm <- table(Predicted = preds, Actual = test_data$fraud)
  
  # Handle edge cases where a class may be missing
  tp <- ifelse(!is.na(cm["1","1"]), cm["1","1"], 0)
  fp <- ifelse(!is.na(cm["1","0"]), cm["1","0"], 0)
  tn <- ifelse(!is.na(cm["0","0"]), cm["0","0"], 0)
  fn <- ifelse(!is.na(cm["0","1"]), cm["0","1"], 0)
  
  precision <- ifelse((tp + fp) > 0, tp / (tp + fp), 0)
  recall    <- ifelse((tp + fn) > 0, tp / (tp + fn), 0)
  f1        <- ifelse((precision + recall) > 0,
                      2 * precision * recall / (precision + recall), 0)
  accuracy  <- (tp + tn) / (tp + fp + tn + fn)
  
  data.frame(
    Threshold = t,
    Accuracy  = round(accuracy, 4),
    Precision = round(precision, 4),
    Recall    = round(recall, 4),
    F1_Score  = round(f1, 4)
  )
}))

kable(threshold_results, caption = "Model Performance Across Decision Thresholds")
Model Performance Across Decision Thresholds
Threshold Accuracy Precision Recall F1_Score
0.3 0.9319 0.8912 0.9840 0.9353
0.4 0.9408 0.9156 0.9712 0.9425
0.5 0.9403 0.9331 0.9486 0.9408
0.6 0.9320 0.9474 0.9148 0.9308
0.7 0.9190 0.9603 0.8742 0.9152

Business interpretation: - A lower threshold (0.3) prioritizes catching fraud — useful when the cost of a missed fraud is high (e.g. large transactions) - A higher threshold (0.7) reduces false positives — useful when customer experience matters more and false declines are costly - For most fraud systems, 0.4–0.5 balances both concerns well


8. Model Comparison & Conclusion

8.1 Model Comparison

# Consolidate all model metrics in one place for direct comparison.
# This gives a clear view of the tradeoffs between interpretability and performance.

# Logistic Regression metrics (from test set)
lr_preds <- ifelse(test_probs > 0.5, 1, 0)
lr_cm    <- table(Predicted = lr_preds, Actual = test_data$fraud)
lr_acc   <- sum(diag(lr_cm)) / sum(lr_cm)
lr_prec  <- lr_cm["1","1"] / sum(lr_cm["1",])
lr_rec   <- lr_cm["1","1"] / sum(lr_cm[,"1"])
lr_f1    <- 2 * lr_prec * lr_rec / (lr_prec + lr_rec)
lr_auc   <- as.numeric(auc(roc(test_data$fraud, test_probs)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Random Forest metrics (from CV model test set)
rf_cm_obj <- confusionMatrix(cv_test_preds, test_data_rf$fraud)
rf_acc    <- rf_cm_obj$overall["Accuracy"]
rf_prec   <- rf_cm_obj$byClass["Precision"]
rf_rec    <- rf_cm_obj$byClass["Recall"]
rf_f1     <- rf_cm_obj$byClass["F1"]

comparison_table <- data.frame(
  Model = c("Logistic Regression", "Random Forest (CV)"),
  Accuracy  = round(c(lr_acc, rf_acc), 4),
  Precision = round(c(lr_prec, rf_prec), 4),
  Recall    = round(c(lr_rec, rf_rec), 4),
  F1_Score  = round(c(lr_f1, rf_f1), 4),
  AUC       = c(round(lr_auc, 4), "N/A"),
  Interpretable = c("Yes", "No")
)

kable(comparison_table, caption = "Model Comparison: Logistic Regression vs Random Forest")
Model Comparison: Logistic Regression vs Random Forest
Model Accuracy Precision Recall F1_Score AUC Interpretable
Logistic Regression 0.9403 0.9331 0.9486 0.9408 0.9793 Yes
Accuracy Random Forest (CV) 1.0000 1.0000 1.0000 1.0000 N/A No

8.2 Recommendation

## 
## RECOMMENDATION
## ==============
## Both models perform well, but serve different purposes:
## 
## Logistic Regression:
## - Fully interpretable — coefficients directly explain why a transaction is flagged
## - Suitable for regulated environments where explainability is required
## - AUC confirms strong discrimination ability
## - Recommended for: production fraud flagging with human review
## 
## Random Forest:
## - Higher raw accuracy, confirmed stable via 3-fold cross-validation
## - Captures non-linear patterns and interactions automatically
## - Less interpretable — harder to explain to stakeholders why a transaction was flagged
## - Recommended for: automated batch scoring where throughput matters more than explainability
## 
## Final recommendation: deploy logistic regression as the primary model for
## explainability, with RF as a secondary validation layer for high-value transactions.

8.3 Limitations & Next Steps

Current limitations: - Dataset is synthetic — real-world fraud patterns are more complex and evolving - Bootstrapping is a simple balancing approach; SMOTE may produce better minority class representation - Models are static — fraud patterns shift over time, requiring periodic retraining

What I would do next: 1. SMOTE instead of bootstrapping for more realistic minority class augmentation 2. XGBoost — often outperforms RF on tabular fraud data with less tuning 3. Real-time scoring pipeline — wrap the model in a REST API (Plumber in R or FastAPI in Python) for live transaction scoring 4. Drift monitoring — track model performance over time as fraud patterns evolve 5. Cost-sensitive learning — assign different misclassification costs to false negatives vs false positives based on transaction value