Family Prediction Models

This page trains three category-specific random forest models (500 trees each) — one for Adware, one for Riskware, and one for Trojan — to predict the malware family within each category.

Load Libraries and Data

# Load preprocessed data (from preprocessing.qmd)
# NOTE(review): path is relative to the project root — assumes the document
# is rendered from there; confirm against the project's render configuration.
andmal_after <- readRDS("data/processed/andmal_after.rds")

All three categories are imbalanced: some families have thousands of samples while others have fewer than 10. The breakdown of the families for all three categories is available at the original site: https://www.unb.ca/cic/datasets/andmal2020.html

Feature Selection

# Build the feature column list for family prediction by removing metadata
# columns (labels, file paths, hashes) while keeping Category as a feature.

# Metadata columns to exclude from features (keep only Category)
metadata_cols <- c("Family", "Category_file", "reboot_state", "path", "file", "Hash")

# Get all column names
all_cols <- names(andmal_after)

# Identify metadata columns that actually exist in this data set
metadata_cols_present <- intersect(metadata_cols, all_cols)

# Feature columns for family prediction (exclude metadata, keep Category)
feature_cols_family <- setdiff(all_cols, metadata_cols_present)

# Additional exclusion: any rank/color columns.
# Use base grepl() instead of stringr::str_detect(): stringr is never loaded
# in this file, so str_detect() would fail when the chunk runs standalone.
# The regex is the same, so the selected columns are identical.
feature_cols_family <- feature_cols_family[!grepl("rank|color|fam_color|rank_in_cat", feature_cols_family)]

Train Category-Specific Models

# Load required libraries. library() errors loudly when a package is missing,
# which is preferable to `if (!require(...)) library(...)`: require() returns
# FALSE instead of erroring, so that guard was redundant and only delayed the
# failure to the second call.
library(dplyr)
library(ranger)

# caret is optional: the training loop below falls back to base R stratified
# sampling when it is unavailable. requireNamespace() checks availability
# without attaching; attach it only when present so caret-dependent chunks
# later in the file behave as before.
caret_available <- requireNamespace("caret", quietly = TRUE)
if (caret_available) {
  library(caret)
}

# Categories to model
target_categories <- c("Adware", "Riskware", "Trojan")

# Storage for models and the per-category train/test splits
# (needed later for LIME/SHAP explanations and for evaluation)
category_models <- list()
category_train_data_list <- list()
category_test_data_list <- list()

# mtry for family models: floor(sqrt(p)), the conventional default for
# random forest classification
mtry_family <- floor(sqrt(length(feature_cols_family)))


# Train one family-prediction random forest per category.
# NOTE(review): train_test_split, num_trees and num_threads are not defined
# anywhere in this file — they are assumed to come from an earlier setup
# chunk of the document; confirm before rendering standalone.
for (category in target_categories) {
  
  # Filter data for this category
  category_data <- andmal_after %>%
    filter(Category == category)
  
  # Guard: nothing to train on for this category
  if (nrow(category_data) == 0) {
    cat(sprintf("  WARNING: No samples found for category '%s'. Skipping.\n", category))
    next
  }
  
  # Check family distribution (kept for inspection; not used below)
  family_dist <- table(category_data$Family)
  
  # Skip if too few families or samples to make classification meaningful
  if (length(unique(category_data$Family)) < 2) {
    cat(sprintf("  WARNING: Category '%s' has fewer than 2 families. Skipping.\n", category))
    next
  }
  
  if (nrow(category_data) < 50) {
    cat(sprintf("  WARNING: Category '%s' has fewer than 50 samples. Skipping.\n", category))
    next
  }
  
  # Create train/test split stratified by Family so rare families appear in
  # both partitions whenever possible. The seed is reset per category so each
  # category's split is reproducible independently of loop order.
  set.seed(15)
  if (caret_available) {
    category_train_index <- caret::createDataPartition(
      category_data$Family,
      p = train_test_split,
      list = FALSE,
      times = 1
    )
    category_train <- category_data[category_train_index, ]
    category_test <- category_data[-category_train_index, ]
  } else {
    # Base R stratified sampling by Family
    families <- unique(category_data$Family)
    train_indices <- c()
    for (fam in families) {
      fam_indices <- which(category_data$Family == fam)
      if (length(fam_indices) > 1) {
        # At least one sample per family always goes to training
        n_train <- max(1, round(length(fam_indices) * train_test_split))
        train_fam_indices <- sample(fam_indices, n_train)
        train_indices <- c(train_indices, train_fam_indices)
      } else {
        # If only one sample, put it in training
        train_indices <- c(train_indices, fam_indices)
      }
    }
    category_train <- category_data[train_indices, ]
    category_test <- category_data[-train_indices, ]
  }
  
  
  # Prepare data for training (ensure data.frame, not tibble, since ranger
  # and the explainers used later expect plain data.frames).
  # NOTE(review): feature_cols_family still includes Category, which is
  # constant within a single category subset — harmless but uninformative.
  train_family_x <- as.data.frame(category_train[, feature_cols_family, drop = FALSE])
  train_family_y <- category_train$Family
  test_family_x <- as.data.frame(category_test[, feature_cols_family, drop = FALSE])
  test_family_y <- category_test$Family
  
  # Ensure Family factor levels match across train and test so prediction
  # columns and confusion matrices line up later
  train_family_y <- factor(train_family_y, levels = unique(c(train_family_y, test_family_y)))
  test_family_y <- factor(test_family_y, levels = levels(train_family_y))
  
  # Train model: probability forest so predict() returns class probabilities;
  # impurity importance is stored for interpretability
  start_time <- Sys.time()
  
  rf_family <- ranger(
    x = train_family_x,
    y = train_family_y,
    num.trees = num_trees,
    mtry = mtry_family,
    min.node.size = 1,
    num.threads = num_threads,
    classification = TRUE,
    probability = TRUE,
    importance = "impurity",
    verbose = FALSE
  )
  
  end_time <- Sys.time()
  training_time <- difftime(end_time, start_time, units = "mins")
  
  # Store model and training data (needed for LIME/SHAP)
  category_models[[category]] <- rf_family
  
  # Save model to disk, creating the models directory on first use
  if (!dir.exists("data/models")) {
    dir.create("data/models", recursive = TRUE)
  }
  model_name <- paste0("data/models/rf_family_", tolower(category), "_model.rds")
  saveRDS(rf_family, model_name)
  
  # Store training data for this category (needed for LIME/SHAP explanations)
  # Save as data.frame to avoid tibble issues
  category_train_data_list[[category]] <- list(
    train_x = train_family_x,
    train_y = train_family_y
  )
  
  # Store test data for this category (needed for evaluation)
  category_test_data_list[[category]] <- list(
    test_x = test_family_x,
    test_y = test_family_y
  )
}

Evaluate Riskware Model

# caret is required here for confusionMatrix(); library() fails fast with a
# clear error if it is missing (the old `if (!require(...)) library(...)`
# guard was redundant because require() already attempts the load).
library(caret)

category <- "Riskware"

if (category %in% names(category_models)) {
  # Pull the held-out test split and the fitted model for Riskware
  test_family_x <- category_test_data_list[[category]]$test_x
  test_family_y <- category_test_data_list[[category]]$test_y
  model <- category_models[[category]]
  
  # Generate class-probability predictions
  # (the model was trained with probability = TRUE)
  pred_riskware <- predict(model, test_family_x)
  pred_riskware_probs <- pred_riskware$predictions
  
  # Predicted class = column with the highest probability per row; realign the
  # factor levels with the test labels so confusionMatrix() can compare them
  pred_riskware_class <- colnames(pred_riskware_probs)[apply(pred_riskware_probs, 1, which.max)]
  pred_riskware_class <- factor(pred_riskware_class, levels = levels(test_family_y))
  
  # Confusion matrix (always use caret)
  cm_riskware <- caret::confusionMatrix(pred_riskware_class, test_family_y)
  
  # Overall and per-class metrics for reporting
  metrics_riskware <- cm_riskware$overall
  per_class_riskware <- cm_riskware$byClass
} else {
  stop("Riskware model not found. Please ensure the model was trained successfully.")
}

Riskware Model Performance Metrics

Riskware Family Prediction Model Performance Metrics
Family Overall Accuracy Precision Recall F1 Score
Class: smsreg 0.8994 0.8948 0.9937 0.9417
Class: skymobi 0.8994 0.9491 0.9812 0.9649
Class: smspay 0.8994 0.8182 0.4865 0.6102
Class: <unknown> 0.8994 NA 0.0000 NA
Class: jiagu 0.8994 0.7179 0.9333 0.8116
Class: revmob 0.8994 0.9022 0.9222 0.9121
Class: mobilepay 0.8994 1.0000 0.4444 0.6154
Class: triada 0.8994 1.0000 0.3333 0.5000
Class: anydown 0.8994 1.0000 0.6667 0.8000
Class: wificrack 0.8994 NA 0.0000 NA
Class: dnotua 0.8994 0.6667 0.6667 0.6667
Class: wapron 0.8994 1.0000 1.0000 1.0000
Class: metasploit 0.8994 1.0000 0.5000 0.6667
Class: tordow 0.8994 NA NA NA
Class: deng 0.8994 NA NA NA
Class: secneo 0.8994 NA NA NA
Class: tencentprotect 0.8994 NA NA NA
Class: kingroot 0.8994 NA NA NA

The model performs well for classes such as smsreg, for which it has thousands of training examples. However, its performance on the dnotua family, with only 36 examples, is not great. Adware and Trojan have even more classes, and the imbalance gets worse.

Load Selected Instances

# Reproducibility for anything downstream that samples
set.seed(15)

# The per-category explanation instances are produced by category.qmd;
# fail fast with a clear message when that step has not been run yet.
instances_path <- "data/processed/selected_instances.rds"
if (!file.exists(instances_path)) {
  stop("Selected instances file not found. Please run category.qmd first to generate selected_instances.rds")
}
selected_instances_df <- readRDS(instances_path)

# Feature matrix handed to the explainers (same pattern as category.qmd)
selected_instances_x <- selected_instances_df[, feature_cols_family, drop = FALSE]

Model Interpretability

Adware Family Model Explanations

# Explain the Adware family model's prediction for the selected instance with
# LIME (local surrogate) and fastshap (per-instance SHAP values).
# library() errors loudly if a package is missing; the previous
# `if (!require(...)) library(...)` guard was redundant.
library(lime)
library(fastshap)
library(ggplot2)
library(dplyr)

# Set seed for reproducibility
set.seed(15)

category <- "Adware"

if (category %in% names(category_models)) {
  
  # Get the corresponding instance (same pattern as category.qmd)
  instance_idx <- which(selected_instances_df$Category == category)
  if (length(instance_idx) == 0) {
    cat(sprintf("No instance found for category %s\n", category))
  } else {
    # Get training data for this category
    train_family_x <- category_train_data_list[[category]]$train_x
    model <- category_models[[category]]
    
    #### LIME Explanation
    # Create LIME explainer on the training features
    explainer_family <- lime(
      train_family_x,
      model = model,
      bin_continuous = TRUE,
      n_bins = 5
    )
    
    # Extract instance (same pattern as category.qmd)
    instance_x <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    
    lime_explanation <- lime::explain(
      instance_x,
      explainer = explainer_family,
      n_features = 10,
      n_permutations = 5000,
      n_labels = 1
    )
    
    # Ensure explanation has proper structure for plotting
    if (nrow(lime_explanation) > 0) {
      # If explanation doesn't have 'case' column, add it
      if (!"case" %in% colnames(lime_explanation)) {
        lime_explanation$case <- 1
      }
      print(plot_features(lime_explanation))
    } else {
      invisible()  # LIME explanation is empty, skipping plot
    }
    
    #### Per-Instance SHAP Values
    
    # Prediction wrapper for fastshap: return the class-probability matrix
    pred_wrapper_family <- function(object, newdata) {
      pred <- predict(object, newdata)
      return(pred$predictions)
    }
    
    # Ensure newdata is a data.frame (same pattern as category.qmd)
    newdata_df <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    
    # Qualify explain() explicitly: both lime and fastshap export explain(),
    # so an unqualified call silently depends on package attach order.
    shap_instance <- fastshap::explain(
      model,
      X = train_family_x,
      newdata = newdata_df,
      pred_wrapper = pred_wrapper_family,
      nsim = 100
    )
    
    # Get predicted class probabilities and the top class
    pred_probs <- predict(model, newdata_df)$predictions
    pred_class <- colnames(pred_probs)[which.max(pred_probs)]
    
    # Handle SHAP values - fastshap returns a matrix/data.frame
    if (is.data.frame(shap_instance) || is.matrix(shap_instance)) {
      # Check if columns are classes or features
      if (pred_class %in% colnames(shap_instance)) {
        # Use [, col] rather than [[col]]: [[ ]] errors on a matrix, and this
        # branch explicitly admits matrices as well as data.frames.
        shap_df <- data.frame(
          feature = rownames(shap_instance),
          shap_value = shap_instance[, pred_class]
        )
      } else if (ncol(shap_instance) == length(feature_cols_family)) {
        # Columns are features
        shap_df <- data.frame(
          feature = colnames(shap_instance),
          shap_value = as.numeric(shap_instance[1, ])
        )
      } else {
        # Try to extract first row or first column
        if (nrow(shap_instance) == 1) {
          shap_df <- data.frame(
            feature = colnames(shap_instance),
            shap_value = as.numeric(shap_instance[1, ])
          )
        } else {
          shap_df <- data.frame(
            feature = rownames(shap_instance),
            shap_value = as.numeric(shap_instance[, 1])
          )
        }
      }
      
      # Keep the 20 features with the largest absolute contribution
      shap_df <- shap_df %>%
        arrange(desc(abs(shap_value))) %>%
        head(20)
      
      # Determine the order of magnitude (exponent) for scientific notation
      max_abs_value <- max(abs(shap_df$shap_value))
      if (max_abs_value > 0) {
        exponent <- floor(log10(max_abs_value))
        # Round to nearest multiple of 3 for cleaner display
        exponent <- round(exponent / 3) * 3
      } else {
        exponent <- 0
      }
      
      # Custom axis label function: rescale by the exponent so tick labels
      # show only significant digits
      scale_factor <- 10^(-exponent)
      label_func <- function(x) {
        scaled <- x * scale_factor
        # Format with appropriate decimal places
        if (abs(exponent) >= 3) {
          sprintf("%.3f", scaled)
        } else {
          sprintf("%.4f", scaled)
        }
      }
      
      # Axis title carries the scale factor when one is applied
      y_axis_label <- if (abs(exponent) >= 3) {
        sprintf("SHAP Value (×10^%d)", exponent)
      } else {
        "SHAP Value"
      }
      
      # Horizontal bar chart of the top-20 SHAP values, colored by sign
      p_shap <- ggplot(shap_df, aes(x = reorder(feature, shap_value), y = shap_value)) +
        geom_col(aes(fill = shap_value > 0)) +
        scale_fill_manual(
          values = c("TRUE" = "#2E8B57", "FALSE" = "#DC143C"),
          labels = c("TRUE" = "Positive", "FALSE" = "Negative"),
          name = "SHAP Value"
        ) +
        scale_y_continuous(labels = label_func) +
        coord_flip() +
        labs(
          title = sprintf("SHAP Values - %s Instance (%s Family Model)", category, pred_class),
          subtitle = "Top 20 features by absolute SHAP value",
          x = "Feature",
          y = y_axis_label
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 14, face = "bold"),
          plot.subtitle = element_text(size = 12),
          axis.text.y = element_text(size = 8)
        )
      
      print(p_shap)
    } else {
      cat(sprintf("SHAP values format not recognized for %s instance\n", category))
    }
  }
} else {
  cat(sprintf("Model for %s not found.\n", category))
}

For the Adware sample, the SHAP plot shows that memory, UI/WebView, and environment-probing features together push the model strongly toward the stopsms label. That matches the EDA story that Adware is both memory-heavy and very DB/WebView intensive. LIME largely highlights the same patterns but reshuffles the rankings considerably.

Riskware Family Model Explanations

# Explain the Riskware family model's prediction for the selected instance
# with LIME and fastshap. library() errors loudly if a package is missing;
# the previous `if (!require(...)) library(...)` guard was redundant.
library(lime)
library(fastshap)
library(ggplot2)
library(dplyr)

# Set seed for reproducibility
set.seed(15)

category <- "Riskware"

if (category %in% names(category_models)) {
  
  # Get the corresponding instance (same pattern as category.qmd)
  instance_idx <- which(selected_instances_df$Category == category)
  if (length(instance_idx) == 0) {
    cat(sprintf("No instance found for category %s\n", category))
  } else {
    # Get training data for this category
    train_family_x <- category_train_data_list[[category]]$train_x
    model <- category_models[[category]]
    
    #### LIME Explanation
    cat(sprintf("\n--- LIME Explanation for %s Instance ---\n", category))
    
    # Create LIME explainer on the training features
    explainer_family <- lime(
      train_family_x,
      model = model,
      bin_continuous = TRUE,
      n_bins = 5
    )
    
    # Extract instance (same pattern as category.qmd)
    instance_x <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    
    lime_explanation <- lime::explain(
      instance_x,
      explainer = explainer_family,
      n_features = 10,
      n_permutations = 5000,
      n_labels = 1
    )
    
    cat("LIME Explanation:\n")
    # Ensure explanation has proper structure for plotting
    if (nrow(lime_explanation) > 0) {
      # If explanation doesn't have 'case' column, add it
      if (!"case" %in% colnames(lime_explanation)) {
        lime_explanation$case <- 1
      }
      print(plot_features(lime_explanation))
    } else {
      invisible()  # LIME explanation is empty, skipping plot
    }
    
    #### Per-Instance SHAP Values
    # Prediction wrapper for fastshap: return the class-probability matrix
    pred_wrapper_family <- function(object, newdata) {
      pred <- predict(object, newdata)
      return(pred$predictions)
    }
    
    # Ensure newdata is a data.frame (same pattern as category.qmd)
    newdata_df <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    
    # Qualify explain() explicitly: both lime and fastshap export explain(),
    # so an unqualified call silently depends on package attach order.
    shap_instance <- fastshap::explain(
      model,
      X = train_family_x,
      newdata = newdata_df,
      pred_wrapper = pred_wrapper_family,
      nsim = 100
    )
    
    # Get predicted class probabilities and the top class
    pred_probs <- predict(model, newdata_df)$predictions
    pred_class <- colnames(pred_probs)[which.max(pred_probs)]
    
    # Handle SHAP values - fastshap returns a matrix/data.frame
    if (is.data.frame(shap_instance) || is.matrix(shap_instance)) {
      # Check if columns are classes or features
      if (pred_class %in% colnames(shap_instance)) {
        # Use [, col] rather than [[col]]: [[ ]] errors on a matrix, and this
        # branch explicitly admits matrices as well as data.frames.
        shap_df <- data.frame(
          feature = rownames(shap_instance),
          shap_value = shap_instance[, pred_class]
        )
      } else if (ncol(shap_instance) == length(feature_cols_family)) {
        # Columns are features
        shap_df <- data.frame(
          feature = colnames(shap_instance),
          shap_value = as.numeric(shap_instance[1, ])
        )
      } else {
        # Try to extract first row or first column
        if (nrow(shap_instance) == 1) {
          shap_df <- data.frame(
            feature = colnames(shap_instance),
            shap_value = as.numeric(shap_instance[1, ])
          )
        } else {
          shap_df <- data.frame(
            feature = rownames(shap_instance),
            shap_value = as.numeric(shap_instance[, 1])
          )
        }
      }
      
      # Keep the 20 features with the largest absolute contribution
      shap_df <- shap_df %>%
        arrange(desc(abs(shap_value))) %>%
        head(20)
      
      # Determine the order of magnitude (exponent) for scientific notation
      max_abs_value <- max(abs(shap_df$shap_value))
      if (max_abs_value > 0) {
        exponent <- floor(log10(max_abs_value))
        # Round to nearest multiple of 3 for cleaner display
        exponent <- round(exponent / 3) * 3
      } else {
        exponent <- 0
      }
      
      # Custom axis label function: rescale by the exponent so tick labels
      # show only significant digits
      scale_factor <- 10^(-exponent)
      label_func <- function(x) {
        scaled <- x * scale_factor
        # Format with appropriate decimal places
        if (abs(exponent) >= 3) {
          sprintf("%.3f", scaled)
        } else {
          sprintf("%.4f", scaled)
        }
      }
      
      # Axis title carries the scale factor when one is applied
      y_axis_label <- if (abs(exponent) >= 3) {
        sprintf("SHAP Value (×10^%d)", exponent)
      } else {
        "SHAP Value"
      }
      
      # Horizontal bar chart of the top-20 SHAP values, colored by sign
      p_shap <- ggplot(shap_df, aes(x = reorder(feature, shap_value), y = shap_value)) +
        geom_col(aes(fill = shap_value > 0)) +
        scale_fill_manual(
          values = c("TRUE" = "#2E8B57", "FALSE" = "#DC143C"),
          labels = c("TRUE" = "Positive", "FALSE" = "Negative"),
          name = "SHAP Value"
        ) +
        scale_y_continuous(labels = label_func) +
        coord_flip() +
        labs(
          title = sprintf("SHAP Values - %s Instance (%s Family Model)", category, pred_class),
          subtitle = "Top 20 features by absolute SHAP value",
          x = "Feature",
          y = y_axis_label
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 14, face = "bold"),
          plot.subtitle = element_text(size = 12),
          axis.text.y = element_text(size = 8)
        )
      
      print(p_shap)
    } else {
      cat(sprintf("SHAP values format not recognized for %s instance\n", category))
    }
  }
} else {
  cat(sprintf("Model for %s not found.\n", category))
}

--- LIME Explanation for Riskware Instance ---
LIME Explanation:

For the Riskware instance, family-level SHAP values emphasize process volume, SMS/telephony metadata (like TelephonyManager_getNetworkOperatorName and getDeviceId), receiver registration, and DB/API logging features as the main drivers of the smsreg family prediction. Several of these (e.g., TelephonyManager_getDeviceId, Memory_PssTotal) are also globally important in the category model, so the same “heavy, system-integrated, telephony-aware app” profile helps both classify it as Riskware and then refine it to an smsreg-style family.

Trojan Family Model Explanations

# Explain the Trojan family model's prediction for the selected instance
# with LIME and fastshap. library() errors loudly if a package is missing;
# the previous `if (!require(...)) library(...)` guard was redundant.
library(lime)
library(fastshap)
library(ggplot2)
library(dplyr)

# Set seed for reproducibility
set.seed(15)

category <- "Trojan"

if (category %in% names(category_models)) {
  
  # Get the corresponding instance (same pattern as category.qmd)
  instance_idx <- which(selected_instances_df$Category == category)
  if (length(instance_idx) == 0) {
    cat(sprintf("No instance found for category %s\n", category))
  } else {
    # Get training data for this category
    train_family_x <- category_train_data_list[[category]]$train_x
    model <- category_models[[category]]
    
    #### LIME Explanation
    cat(sprintf("\n--- LIME Explanation for %s Instance ---\n", category))
    
    # Create LIME explainer on the training features
    explainer_family <- lime(
      train_family_x,
      model = model,
      bin_continuous = TRUE,
      n_bins = 5
    )
    
    # Extract instance (same pattern as category.qmd)
    instance_x <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    
    lime_explanation <- lime::explain(
      instance_x,
      explainer = explainer_family,
      n_features = 10,
      n_permutations = 5000,
      n_labels = 1
    )
    
    cat("LIME Explanation:\n")
    # Ensure explanation has proper structure for plotting
    if (nrow(lime_explanation) > 0) {
      # If explanation doesn't have 'case' column, add it
      if (!"case" %in% colnames(lime_explanation)) {
        lime_explanation$case <- 1
      }
      print(plot_features(lime_explanation))
    } else {
      invisible()  # LIME explanation is empty, skipping plot
    }
    
    #### Per-Instance SHAP Values
    cat(sprintf("\n--- Per-Instance SHAP Values for %s Instance ---\n", category))
    
    # Prediction wrapper for fastshap: return the class-probability matrix
    pred_wrapper_family <- function(object, newdata) {
      pred <- predict(object, newdata)
      return(pred$predictions)
    }
    
    # Ensure newdata is a data.frame (same pattern as category.qmd)
    newdata_df <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    
    # Qualify explain() explicitly: both lime and fastshap export explain(),
    # so an unqualified call silently depends on package attach order.
    shap_instance <- fastshap::explain(
      model,
      X = train_family_x,
      newdata = newdata_df,
      pred_wrapper = pred_wrapper_family,
      nsim = 100
    )
    
    # Get predicted class probabilities and the top class
    pred_probs <- predict(model, newdata_df)$predictions
    pred_class <- colnames(pred_probs)[which.max(pred_probs)]
    
    # Handle SHAP values - fastshap returns a matrix/data.frame
    if (is.data.frame(shap_instance) || is.matrix(shap_instance)) {
      # Check if columns are classes or features
      if (pred_class %in% colnames(shap_instance)) {
        # Use [, col] rather than [[col]]: [[ ]] errors on a matrix, and this
        # branch explicitly admits matrices as well as data.frames.
        shap_df <- data.frame(
          feature = rownames(shap_instance),
          shap_value = shap_instance[, pred_class]
        )
      } else if (ncol(shap_instance) == length(feature_cols_family)) {
        # Columns are features
        shap_df <- data.frame(
          feature = colnames(shap_instance),
          shap_value = as.numeric(shap_instance[1, ])
        )
      } else {
        # Try to extract first row or first column
        if (nrow(shap_instance) == 1) {
          shap_df <- data.frame(
            feature = colnames(shap_instance),
            shap_value = as.numeric(shap_instance[1, ])
          )
        } else {
          shap_df <- data.frame(
            feature = rownames(shap_instance),
            shap_value = as.numeric(shap_instance[, 1])
          )
        }
      }
      
      # Keep the 20 features with the largest absolute contribution
      shap_df <- shap_df %>%
        arrange(desc(abs(shap_value))) %>%
        head(20)
      
      cat(sprintf("Top 20 SHAP values (predicted class: %s):\n", pred_class))
      print(shap_df)
      
      # Determine the order of magnitude (exponent) for scientific notation
      max_abs_value <- max(abs(shap_df$shap_value))
      if (max_abs_value > 0) {
        exponent <- floor(log10(max_abs_value))
        # Round to nearest multiple of 3 for cleaner display
        exponent <- round(exponent / 3) * 3
      } else {
        exponent <- 0
      }
      
      # Custom axis label function: rescale by the exponent so tick labels
      # show only significant digits
      scale_factor <- 10^(-exponent)
      label_func <- function(x) {
        scaled <- x * scale_factor
        # Format with appropriate decimal places
        if (abs(exponent) >= 3) {
          sprintf("%.3f", scaled)
        } else {
          sprintf("%.4f", scaled)
        }
      }
      
      # Axis title carries the scale factor when one is applied
      y_axis_label <- if (abs(exponent) >= 3) {
        sprintf("SHAP Value (×10^%d)", exponent)
      } else {
        "SHAP Value"
      }
      
      # Horizontal bar chart of the top-20 SHAP values, colored by sign
      p_shap <- ggplot(shap_df, aes(x = reorder(feature, shap_value), y = shap_value)) +
        geom_col(aes(fill = shap_value > 0)) +
        scale_fill_manual(
          values = c("TRUE" = "#2E8B57", "FALSE" = "#DC143C"),
          labels = c("TRUE" = "Positive", "FALSE" = "Negative"),
          name = "SHAP Value"
        ) +
        scale_y_continuous(labels = label_func) +
        coord_flip() +
        labs(
          title = sprintf("SHAP Values - %s Instance (%s Family Model)", category, pred_class),
          subtitle = "Top 20 features by absolute SHAP value",
          x = "Feature",
          y = y_axis_label
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 14, face = "bold"),
          plot.subtitle = element_text(size = 12),
          axis.text.y = element_text(size = 8)
        )
      
      print(p_shap)
    } else {
      cat(sprintf("SHAP values format not recognized for %s instance\n", category))
    }
  }
} else {
  cat(sprintf("Model for %s not found.\n", category))
}

--- LIME Explanation for Trojan Instance ---
LIME Explanation:


--- Per-Instance SHAP Values for Trojan Instance ---
Top 20 SHAP values (predicted class: mytrackp):
                                                                  feature
1                                                      Memory_SharedDirty
2                                                         env_probe_count
3                                                          Logcat_warning
4                                                      Memory_ParcelCount
5                                                        Memory_HeapAlloc
6              API_Database_android.database.sqlite.SQLiteDatabase_update
7                      API_Crypto-Hash_java.security.MessageDigest_update
8                                                        Battery_wakelock
9                                              Network_TotalReceivedBytes
10                                                         accounts_calls
11 API_DeviceData_android.content.ContentResolver_registerContentObserver
12                    API_IPC_android.content.ContextWrapper_startService
13                                                        Memory_HeapSize
14   API_Database_android.database.sqlite.SQLiteDatabase_compileStatement
15       API_DexClassLoader_dalvik.system.BaseDexClassLoader_findResource
16                         API_DeviceData_android.os.SystemProperties_get
17                                          Network_TotalTransmittedBytes
18                                                          API__sessions
19                                                    total_DB_read_calls
20                                           Network_TotalReceivedPackets
      shap_value
1   3.825407e-19
2   3.608775e-19
3  -3.176506e-19
4  -2.517067e-19
5  -2.269832e-19
6   2.196397e-19
7  -2.159814e-19
8   2.149976e-19
9  -1.939161e-19
10  1.858948e-19
11 -1.739025e-19
12 -1.525658e-19
13 -1.509509e-19
14 -1.457875e-19
15 -1.297350e-19
16 -1.291733e-19
17  1.277596e-19
18  1.267326e-19
19 -1.251119e-19
20  1.235404e-19

For the Trojan sample, the family SHAP plot shows a classic spyware / tracking fingerprint: large shared memory and heap usage, frequent env probes, warning-level logcat activity, DB updates/compile calls, crypto hashing, wakelocks, and substantial network I/O all push toward the mytrackp family. Many of these resource-intensive memory and environment features (e.g., Memory_SharedDirty, Memory_HeapAlloc, env_probe_count) are also among the top global drivers of the category model, so the same "noisy, always-on, data-hungry" behavior explains both why the model flags the app as a Trojan at the category level and why it narrows in specifically on mytrackp at the family level. LIME coefficients also show some agreement but are very unstable across all of these family classifiers; I suspect this is because, with ~150 features, a local linear model restricted to ~10 features cannot capture the details even locally.