# Load preprocessed data (from preprocessing.qmd).
# Fail early with a clear message if preprocessing has not been run yet,
# matching the guard pattern used for selected_instances.rds later in this file.
if (!file.exists("data/processed/andmal_after.rds")) {
  stop("Preprocessed data not found. Please run preprocessing.qmd first to generate andmal_after.rds")
}
andmal_after <- readRDS("data/processed/andmal_after.rds")

# Family Prediction Models ----
This page trains three category-specific random forest models (one each for Adware, Riskware, and Trojan), with 500 trees each, to predict the malware family within each category.
Load Libraries and Data
All three categories are imbalanced: certain families have thousands of samples, while others have fewer than 10. The breakdown of the families in all three of these categories is available at the original site: https://www.unb.ca/cic/datasets/andmal2020.html
Feature Selection
# Metadata columns to exclude from features (keep only Category, which the
# family models can use as a predictor within each category subset)
metadata_cols <- c("Family", "Category_file", "reboot_state", "path", "file", "Hash")

# Get all column names
all_cols <- names(andmal_after)

# Identify metadata columns that actually exist in this data set
metadata_cols_present <- intersect(metadata_cols, all_cols)

# Feature columns for family prediction (exclude metadata, keep Category)
feature_cols_family <- setdiff(all_cols, metadata_cols_present)

# Additional exclusion: any rank/color helper columns.
# Use base grepl() instead of stringr::str_detect() so this chunk does not
# depend on stringr being attached -- the defensive library() calls below only
# cover dplyr/ranger/caret. grepl(pattern, x) is semantically identical to
# str_detect(x, pattern) for a plain regex. Note that "fam_color" and
# "rank_in_cat" are already matched by "rank|color"; they are kept in the
# pattern for explicitness.
feature_cols_family <- feature_cols_family[!grepl("rank|color|fam_color|rank_in_cat", feature_cols_family)]

# Train Category-Specific Models ----
# Ensure all required libraries are loaded (defensive check for individual rendering).
# NOTE(review): if (!require(pkg)) library(pkg) is redundant -- require()
# already attaches the package on success; a plain library(pkg) would error
# more clearly on a missing package.
if (!require(dplyr, quietly = TRUE)) library(dplyr)
if (!require(ranger, quietly = TRUE)) library(ranger)
if (!require(caret, quietly = TRUE)) library(caret)

# Categories to model
target_categories <- c("Adware", "Riskware", "Trojan")

# Storage for per-category models and their train/test splits
category_models <- list()
category_train_data_list <- list()
category_test_data_list <- list()

# mtry for the family models: the standard random-forest heuristic floor(sqrt(p))
mtry_family <- floor(sqrt(length(feature_cols_family)))

# caret provides a stratified split; a manual stratified fallback is used below
# if caret is unavailable
caret_available <- require("caret", quietly = TRUE)

# Train one random forest per category.
# NOTE(review): train_test_split, num_trees and num_threads are not defined in
# this chunk -- they are assumed to come from an earlier chunk of the document;
# confirm before rendering this chunk standalone.
for (category in target_categories) {
  # Filter data for this category
  category_data <- andmal_after %>%
    filter(Category == category)
  if (nrow(category_data) == 0) {
    cat(sprintf(" WARNING: No samples found for category '%s'. Skipping.\n", category))
    next
  }
  # Family distribution (informational; family_dist is not used further below)
  family_dist <- table(category_data$Family)
  # Skip degenerate categories: need at least 2 families and at least 50 samples
  if (length(unique(category_data$Family)) < 2) {
    cat(sprintf(" WARNING: Category '%s' has fewer than 2 families. Skipping.\n", category))
    next
  }
  if (nrow(category_data) < 50) {
    cat(sprintf(" WARNING: Category '%s' has fewer than 50 samples. Skipping.\n", category))
    next
  }
  # Create train/test split stratified by Family. Re-seeding inside the loop
  # makes each category's split reproducible independently of loop order.
  set.seed(15)
  if (caret_available) {
    category_train_index <- caret::createDataPartition(
      category_data$Family,
      p = train_test_split,
      list = FALSE,
      times = 1
    )
    category_train <- category_data[category_train_index, ]
    category_test <- category_data[-category_train_index, ]
  } else {
    # Base R stratified sampling by Family
    families <- unique(category_data$Family)
    train_indices <- c()
    for (fam in families) {
      fam_indices <- which(category_data$Family == fam)
      if (length(fam_indices) > 1) {
        # Guarantee at least one training sample per family
        n_train <- max(1, round(length(fam_indices) * train_test_split))
        train_fam_indices <- sample(fam_indices, n_train)
        train_indices <- c(train_indices, train_fam_indices)
      } else {
        # If only one sample, put it in training
        train_indices <- c(train_indices, fam_indices)
      }
    }
    category_train <- category_data[train_indices, ]
    category_test <- category_data[-train_indices, ]
  }
  # Prepare data for training (ensure data.frame, not tibble, for ranger)
  train_family_x <- as.data.frame(category_train[, feature_cols_family, drop = FALSE])
  train_family_y <- category_train$Family
  test_family_x <- as.data.frame(category_test[, feature_cols_family, drop = FALSE])
  test_family_y <- category_test$Family
  # Align factor levels across train and test so prediction columns match.
  # NOTE(review): if Family is already a factor, c(train, test) relies on
  # R >= 4.1 factor-combination semantics -- confirm Family is character here.
  train_family_y <- factor(train_family_y, levels = unique(c(train_family_y, test_family_y)))
  test_family_y <- factor(test_family_y, levels = levels(train_family_y))
  # Train a probability forest (probability = TRUE yields per-class
  # probabilities from predict() rather than hard class votes)
  start_time <- Sys.time()
  rf_family <- ranger(
    x = train_family_x,
    y = train_family_y,
    num.trees = num_trees,
    mtry = mtry_family,
    min.node.size = 1,
    num.threads = num_threads,
    classification = TRUE,
    probability = TRUE,
    importance = "impurity",
    verbose = FALSE
  )
  end_time <- Sys.time()
  training_time <- difftime(end_time, start_time, units = "mins")
  # Store model in memory (needed for LIME/SHAP) and persist it to disk
  category_models[[category]] <- rf_family
  if (!dir.exists("data/models")) {
    dir.create("data/models", recursive = TRUE)
  }
  model_name <- paste0("data/models/rf_family_", tolower(category), "_model.rds")
  saveRDS(rf_family, model_name)
  # Store training data for this category (needed for LIME/SHAP explanations);
  # kept as plain data.frame to avoid tibble issues
  category_train_data_list[[category]] <- list(
    train_x = train_family_x,
    train_y = train_family_y
  )
  # Store test data for this category (needed for evaluation)
  category_test_data_list[[category]] <- list(
    test_x = test_family_x,
    test_y = test_family_y
  )
}

# Evaluate Riskware Model ----
# caret is required here for confusionMatrix(); use library() directly --
# the previous if (!require(...)) library(...) pattern was redundant because
# require() already attaches the package on success.
library(caret)

category <- "Riskware"
if (category %in% names(category_models)) {
  # Held-out test data for Riskware (stored during training above)
  test_family_x <- category_test_data_list[[category]]$test_x
  test_family_y <- category_test_data_list[[category]]$test_y
  model <- category_models[[category]]

  # Generate predictions; the probability forest returns a matrix of
  # per-class probabilities in $predictions
  pred_riskware <- predict(model, test_family_x)
  pred_riskware_probs <- pred_riskware$predictions

  # Predicted class = column with the highest probability for each row.
  # Factor levels are aligned with the test labels so confusionMatrix()
  # sees identical level sets.
  pred_riskware_class <- colnames(pred_riskware_probs)[apply(pred_riskware_probs, 1, which.max)]
  pred_riskware_class <- factor(pred_riskware_class, levels = levels(test_family_y))

  # Confusion matrix plus overall and per-class metrics
  cm_riskware <- caret::confusionMatrix(pred_riskware_class, test_family_y)
  metrics_riskware <- cm_riskware$overall
  per_class_riskware <- cm_riskware$byClass
} else {
  stop("Riskware model not found. Please ensure the model was trained successfully.")
}

# Riskware Model Performance Metrics ----
| Family | Overall Accuracy | Precision | Recall | F1 Score |
|---|---|---|---|---|
| Class: smsreg | 0.8994 | 0.8948 | 0.9937 | 0.9417 |
| Class: skymobi | 0.8994 | 0.9491 | 0.9812 | 0.9649 |
| Class: smspay | 0.8994 | 0.8182 | 0.4865 | 0.6102 |
| Class: <unknown> | 0.8994 | NA | 0.0000 | NA |
| Class: jiagu | 0.8994 | 0.7179 | 0.9333 | 0.8116 |
| Class: revmob | 0.8994 | 0.9022 | 0.9222 | 0.9121 |
| Class: mobilepay | 0.8994 | 1.0000 | 0.4444 | 0.6154 |
| Class: triada | 0.8994 | 1.0000 | 0.3333 | 0.5000 |
| Class: anydown | 0.8994 | 1.0000 | 0.6667 | 0.8000 |
| Class: wificrack | 0.8994 | NA | 0.0000 | NA |
| Class: dnotua | 0.8994 | 0.6667 | 0.6667 | 0.6667 |
| Class: wapron | 0.8994 | 1.0000 | 1.0000 | 1.0000 |
| Class: metasploit | 0.8994 | 1.0000 | 0.5000 | 0.6667 |
| Class: tordow | 0.8994 | NA | NA | NA |
| Class: deng | 0.8994 | NA | NA | NA |
| Class: secneo | 0.8994 | NA | NA | NA |
| Class: tencentprotect | 0.8994 | NA | NA | NA |
| Class: kingroot | 0.8994 | NA | NA | NA |
The model performs well for classes like smsreg, for which it has thousands of examples. However, its performance on the dnotua family, with only 36 examples, is not great. Adware and Trojan have even more classes, and the imbalance gets worse.
Load Selected Instances
# Set seed for reproducibility
set.seed(15)
# Load the selected instances from category.qmd
if (file.exists("data/processed/selected_instances.rds")) {
selected_instances_df <- readRDS("data/processed/selected_instances.rds")
} else {
stop("Selected instances file not found. Please run category.qmd first to generate selected_instances.rds")
}
# Extract feature matrices for explanations (same pattern as category.qmd)
selected_instances_x <- selected_instances_df[, feature_cols_family, drop = FALSE]Model Interpretability
Adware Family Model Explanations
# Ensure required libraries are loaded (defensive check for individual rendering)
if (!require(lime, quietly = TRUE)) library(lime)
if (!require(fastshap, quietly = TRUE)) library(fastshap)
if (!require(ggplot2, quietly = TRUE)) library(ggplot2)
if (!require(dplyr, quietly = TRUE)) library(dplyr)

# Set seed so the LIME permutations and the fastshap Monte Carlo draws are reproducible
set.seed(15)

category <- "Adware"
if (category %in% names(category_models)) {
  # Locate the pre-selected explanation instance for this category
  # (selected_instances_df comes from category.qmd, loaded above)
  instance_idx <- which(selected_instances_df$Category == category)
  if (length(instance_idx) == 0) {
    cat(sprintf("No instance found for category %s\n", category))
  } else {
    # Training data for this category: used by both LIME and fastshap as the
    # background/reference distribution
    train_family_x <- category_train_data_list[[category]]$train_x
    model <- category_models[[category]]
    #### LIME Explanation
    # Build the LIME explainer; continuous features are discretized into
    # 5 bins over the training distribution
    explainer_family <- lime(
      train_family_x,
      model = model,
      bin_continuous = TRUE,
      n_bins = 5
    )
    # Extract instance (same pattern as category.qmd)
    instance_x <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    lime_explanation <- lime::explain(
      instance_x,
      explainer = explainer_family,
      n_features = 10,
      n_permutations = 5000,
      n_labels = 1
    )
    # Ensure explanation has proper structure for plotting
    if (nrow(lime_explanation) > 0) {
      # plot_features() expects a 'case' column; add one if it is missing
      if (!"case" %in% colnames(lime_explanation)) {
        lime_explanation$case <- 1
      }
      print(plot_features(lime_explanation))
    } else {
      invisible() # LIME explanation is empty, skipping plot
    }
    #### Per-Instance SHAP Values
    # Prediction wrapper for fastshap: must return the full matrix of class probabilities
    pred_wrapper_family <- function(object, newdata) {
      pred <- predict(object, newdata)
      return(pred$predictions)
    }
    # Ensure newdata is a data.frame (same pattern as category.qmd)
    newdata_df <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    # NOTE(review): the unqualified explain() resolves to fastshap::explain only
    # because fastshap was attached after lime; writing fastshap::explain would
    # be safer -- confirm the attach order is stable.
    shap_instance <- explain(
      model,
      X = train_family_x,
      newdata = newdata_df,
      pred_wrapper = pred_wrapper_family,
      nsim = 100
    )
    # Predicted class = column with the highest probability. which.max() on the
    # 1-row matrix scans column-major, so it indexes the columns correctly here.
    pred_probs <- predict(model, newdata_df)$predictions
    pred_class <- colnames(pred_probs)[which.max(pred_probs)]
    # fastshap's output shape varies (classes-as-columns vs features-as-columns
    # depending on version/setup), so probe the structure before extracting.
    if (is.data.frame(shap_instance) || is.matrix(shap_instance)) {
      # Case 1: columns are class names -- take the column for the predicted class.
      # NOTE(review): [[pred_class]] works on a data.frame but errors on a plain
      # matrix; confirm shap_instance is a data.frame when this branch is taken.
      if (pred_class %in% colnames(shap_instance)) {
        shap_df <- data.frame(
          feature = rownames(shap_instance),
          shap_value = shap_instance[[pred_class]]
        )
      } else if (ncol(shap_instance) == length(feature_cols_family)) {
        # Case 2: columns are features (a single row of SHAP values)
        shap_df <- data.frame(
          feature = colnames(shap_instance),
          shap_value = as.numeric(shap_instance[1, ])
        )
      } else {
        # Fallback: take the first row or the first column, whichever fits
        if (nrow(shap_instance) == 1) {
          shap_df <- data.frame(
            feature = colnames(shap_instance),
            shap_value = as.numeric(shap_instance[1, ])
          )
        } else {
          shap_df <- data.frame(
            feature = rownames(shap_instance),
            shap_value = as.numeric(shap_instance[, 1])
          )
        }
      }
      # Keep the 20 features with the largest absolute SHAP value
      shap_df <- shap_df %>%
        arrange(desc(abs(shap_value))) %>%
        head(20)
      # Determine the order of magnitude (exponent) used to rescale axis labels
      max_abs_value <- max(abs(shap_df$shap_value))
      if (max_abs_value > 0) {
        exponent <- floor(log10(max_abs_value))
        # Round to the nearest multiple of 3 (engineering notation) for cleaner display
        exponent <- round(exponent / 3) * 3
      } else {
        exponent <- 0
      }
      # Label function: rescale values by 10^(-exponent) so tiny SHAP values
      # display as readable numbers; decimal places depend on the magnitude
      scale_factor <- 10^(-exponent)
      label_func <- function(x) {
        scaled <- x * scale_factor
        # Format with appropriate decimal places
        if (abs(exponent) >= 3) {
          sprintf("%.3f", scaled)
        } else {
          sprintf("%.4f", scaled)
        }
      }
      # Axis title carries the x10^k factor whenever values were rescaled
      y_axis_label <- if (abs(exponent) >= 3) {
        sprintf("SHAP Value (×10^%d)", exponent)
      } else {
        "SHAP Value"
      }
      # Horizontal bar chart of the top-20 SHAP values: green bars push toward
      # the predicted class, red bars push away from it
      p_shap <- ggplot(shap_df, aes(x = reorder(feature, shap_value), y = shap_value)) +
        geom_col(aes(fill = shap_value > 0)) +
        scale_fill_manual(
          values = c("TRUE" = "#2E8B57", "FALSE" = "#DC143C"),
          labels = c("TRUE" = "Positive", "FALSE" = "Negative"),
          name = "SHAP Value"
        ) +
        scale_y_continuous(labels = label_func) +
        coord_flip() +
        labs(
          title = sprintf("SHAP Values - %s Instance (%s Family Model)", category, pred_class),
          subtitle = sprintf("Top 20 features by absolute SHAP value"),
          x = "Feature",
          y = y_axis_label
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 14, face = "bold"),
          plot.subtitle = element_text(size = 12),
          axis.text.y = element_text(size = 8)
        )
      print(p_shap)
    } else {
      cat(sprintf("SHAP values format not recognized for %s instance\n", category))
    }
  }
} else {
  cat(sprintf("Model for %s not found.\n", category))
}

For the Adware sample, the SHAP plots shows that memory / UI / WebView and environment-probing features together push the model strongly toward the stopsms label. That matches the EDA story that Adware is both memory-heavy and very DB/WebView intensive. LIME largely highlights the same patterns but reshuffles the rankings a lot.
Riskware Family Model Explanations
# Ensure required libraries are loaded (defensive check for individual rendering)
if (!require(lime, quietly = TRUE)) library(lime)
if (!require(fastshap, quietly = TRUE)) library(fastshap)
if (!require(ggplot2, quietly = TRUE)) library(ggplot2)
if (!require(dplyr, quietly = TRUE)) library(dplyr)

# Set seed so the LIME permutations and the fastshap Monte Carlo draws are reproducible
set.seed(15)

category <- "Riskware"
if (category %in% names(category_models)) {
  # Locate the pre-selected explanation instance for this category
  # (selected_instances_df comes from category.qmd, loaded above)
  instance_idx <- which(selected_instances_df$Category == category)
  if (length(instance_idx) == 0) {
    cat(sprintf("No instance found for category %s\n", category))
  } else {
    # Training data for this category: used by both LIME and fastshap as the
    # background/reference distribution
    train_family_x <- category_train_data_list[[category]]$train_x
    model <- category_models[[category]]
    #### LIME Explanation
    cat(sprintf("\n--- LIME Explanation for %s Instance ---\n", category))
    # Build the LIME explainer; continuous features are discretized into
    # 5 bins over the training distribution
    explainer_family <- lime(
      train_family_x,
      model = model,
      bin_continuous = TRUE,
      n_bins = 5
    )
    # Extract instance (same pattern as category.qmd)
    instance_x <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    lime_explanation <- lime::explain(
      instance_x,
      explainer = explainer_family,
      n_features = 10,
      n_permutations = 5000,
      n_labels = 1
    )
    cat("LIME Explanation:\n")
    # Ensure explanation has proper structure for plotting
    if (nrow(lime_explanation) > 0) {
      # plot_features() expects a 'case' column; add one if it is missing
      if (!"case" %in% colnames(lime_explanation)) {
        lime_explanation$case <- 1
      }
      print(plot_features(lime_explanation))
    } else {
      invisible() # LIME explanation is empty, skipping plot
    }
    #### Per-Instance SHAP Values
    # Prediction wrapper for fastshap: must return the full matrix of class probabilities
    pred_wrapper_family <- function(object, newdata) {
      pred <- predict(object, newdata)
      return(pred$predictions)
    }
    # Ensure newdata is a data.frame (same pattern as category.qmd)
    newdata_df <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    # NOTE(review): the unqualified explain() resolves to fastshap::explain only
    # because fastshap was attached after lime; writing fastshap::explain would
    # be safer -- confirm the attach order is stable.
    shap_instance <- explain(
      model,
      X = train_family_x,
      newdata = newdata_df,
      pred_wrapper = pred_wrapper_family,
      nsim = 100
    )
    # Predicted class = column with the highest probability. which.max() on the
    # 1-row matrix scans column-major, so it indexes the columns correctly here.
    pred_probs <- predict(model, newdata_df)$predictions
    pred_class <- colnames(pred_probs)[which.max(pred_probs)]
    # fastshap's output shape varies (classes-as-columns vs features-as-columns
    # depending on version/setup), so probe the structure before extracting.
    if (is.data.frame(shap_instance) || is.matrix(shap_instance)) {
      # Case 1: columns are class names -- take the column for the predicted class.
      # NOTE(review): [[pred_class]] works on a data.frame but errors on a plain
      # matrix; confirm shap_instance is a data.frame when this branch is taken.
      if (pred_class %in% colnames(shap_instance)) {
        shap_df <- data.frame(
          feature = rownames(shap_instance),
          shap_value = shap_instance[[pred_class]]
        )
      } else if (ncol(shap_instance) == length(feature_cols_family)) {
        # Case 2: columns are features (a single row of SHAP values)
        shap_df <- data.frame(
          feature = colnames(shap_instance),
          shap_value = as.numeric(shap_instance[1, ])
        )
      } else {
        # Fallback: take the first row or the first column, whichever fits
        if (nrow(shap_instance) == 1) {
          shap_df <- data.frame(
            feature = colnames(shap_instance),
            shap_value = as.numeric(shap_instance[1, ])
          )
        } else {
          shap_df <- data.frame(
            feature = rownames(shap_instance),
            shap_value = as.numeric(shap_instance[, 1])
          )
        }
      }
      # Keep the 20 features with the largest absolute SHAP value
      shap_df <- shap_df %>%
        arrange(desc(abs(shap_value))) %>%
        head(20)
      # Determine the order of magnitude (exponent) used to rescale axis labels
      max_abs_value <- max(abs(shap_df$shap_value))
      if (max_abs_value > 0) {
        exponent <- floor(log10(max_abs_value))
        # Round to the nearest multiple of 3 (engineering notation) for cleaner display
        exponent <- round(exponent / 3) * 3
      } else {
        exponent <- 0
      }
      # Label function: rescale values by 10^(-exponent) so tiny SHAP values
      # display as readable numbers; decimal places depend on the magnitude
      scale_factor <- 10^(-exponent)
      label_func <- function(x) {
        scaled <- x * scale_factor
        # Format with appropriate decimal places
        if (abs(exponent) >= 3) {
          sprintf("%.3f", scaled)
        } else {
          sprintf("%.4f", scaled)
        }
      }
      # Axis title carries the x10^k factor whenever values were rescaled
      y_axis_label <- if (abs(exponent) >= 3) {
        sprintf("SHAP Value (×10^%d)", exponent)
      } else {
        "SHAP Value"
      }
      # Horizontal bar chart of the top-20 SHAP values: green bars push toward
      # the predicted class, red bars push away from it
      p_shap <- ggplot(shap_df, aes(x = reorder(feature, shap_value), y = shap_value)) +
        geom_col(aes(fill = shap_value > 0)) +
        scale_fill_manual(
          values = c("TRUE" = "#2E8B57", "FALSE" = "#DC143C"),
          labels = c("TRUE" = "Positive", "FALSE" = "Negative"),
          name = "SHAP Value"
        ) +
        scale_y_continuous(labels = label_func) +
        coord_flip() +
        labs(
          title = sprintf("SHAP Values - %s Instance (%s Family Model)", category, pred_class),
          subtitle = sprintf("Top 20 features by absolute SHAP value"),
          x = "Feature",
          y = y_axis_label
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 14, face = "bold"),
          plot.subtitle = element_text(size = 12),
          axis.text.y = element_text(size = 8)
        )
      print(p_shap)
    } else {
      cat(sprintf("SHAP values format not recognized for %s instance\n", category))
    }
  }
} else {
  cat(sprintf("Model for %s not found.\n", category))
}
--- LIME Explanation for Riskware Instance ---
LIME Explanation:
For the Riskware instance, family-level SHAP values emphasize process volume, SMS/telephony metadata (like TelephonyManager_getNetworkOperatorName and getDeviceId), receiver registration, and DB/API logging features as the main drivers of the smsreg family prediction. Several of these (e.g., TelephonyManager_getDeviceId, Memory_PssTotal) are also globally important in the category model, so the same “heavy, system-integrated, telephony-aware app” profile helps both classify it as Riskware and then refine it to an smsreg-style family.
Trojan Family Model Explanations
# Ensure required libraries are loaded (defensive check for individual rendering)
if (!require(lime, quietly = TRUE)) library(lime)
if (!require(fastshap, quietly = TRUE)) library(fastshap)
if (!require(ggplot2, quietly = TRUE)) library(ggplot2)
if (!require(dplyr, quietly = TRUE)) library(dplyr)

# Set seed so the LIME permutations and the fastshap Monte Carlo draws are reproducible
set.seed(15)

category <- "Trojan"
if (category %in% names(category_models)) {
  # Locate the pre-selected explanation instance for this category
  # (selected_instances_df comes from category.qmd, loaded above)
  instance_idx <- which(selected_instances_df$Category == category)
  if (length(instance_idx) == 0) {
    cat(sprintf("No instance found for category %s\n", category))
  } else {
    # Training data for this category: used by both LIME and fastshap as the
    # background/reference distribution
    train_family_x <- category_train_data_list[[category]]$train_x
    model <- category_models[[category]]
    #### LIME Explanation
    cat(sprintf("\n--- LIME Explanation for %s Instance ---\n", category))
    # Build the LIME explainer; continuous features are discretized into
    # 5 bins over the training distribution
    explainer_family <- lime(
      train_family_x,
      model = model,
      bin_continuous = TRUE,
      n_bins = 5
    )
    # Extract instance (same pattern as category.qmd)
    instance_x <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    lime_explanation <- lime::explain(
      instance_x,
      explainer = explainer_family,
      n_features = 10,
      n_permutations = 5000,
      n_labels = 1
    )
    cat("LIME Explanation:\n")
    # Ensure explanation has proper structure for plotting
    if (nrow(lime_explanation) > 0) {
      # plot_features() expects a 'case' column; add one if it is missing
      if (!"case" %in% colnames(lime_explanation)) {
        lime_explanation$case <- 1
      }
      print(plot_features(lime_explanation))
    } else {
      invisible() # LIME explanation is empty, skipping plot
    }
    #### Per-Instance SHAP Values
    cat(sprintf("\n--- Per-Instance SHAP Values for %s Instance ---\n", category))
    # Prediction wrapper for fastshap: must return the full matrix of class probabilities
    pred_wrapper_family <- function(object, newdata) {
      pred <- predict(object, newdata)
      return(pred$predictions)
    }
    # Ensure newdata is a data.frame (same pattern as category.qmd)
    newdata_df <- as.data.frame(selected_instances_x[instance_idx, , drop = FALSE])
    # NOTE(review): the unqualified explain() resolves to fastshap::explain only
    # because fastshap was attached after lime; writing fastshap::explain would
    # be safer -- confirm the attach order is stable.
    shap_instance <- explain(
      model,
      X = train_family_x,
      newdata = newdata_df,
      pred_wrapper = pred_wrapper_family,
      nsim = 100
    )
    # Predicted class = column with the highest probability. which.max() on the
    # 1-row matrix scans column-major, so it indexes the columns correctly here.
    pred_probs <- predict(model, newdata_df)$predictions
    pred_class <- colnames(pred_probs)[which.max(pred_probs)]
    # fastshap's output shape varies (classes-as-columns vs features-as-columns
    # depending on version/setup), so probe the structure before extracting.
    if (is.data.frame(shap_instance) || is.matrix(shap_instance)) {
      # Case 1: columns are class names -- take the column for the predicted class.
      # NOTE(review): [[pred_class]] works on a data.frame but errors on a plain
      # matrix; confirm shap_instance is a data.frame when this branch is taken.
      if (pred_class %in% colnames(shap_instance)) {
        shap_df <- data.frame(
          feature = rownames(shap_instance),
          shap_value = shap_instance[[pred_class]]
        )
      } else if (ncol(shap_instance) == length(feature_cols_family)) {
        # Case 2: columns are features (a single row of SHAP values)
        shap_df <- data.frame(
          feature = colnames(shap_instance),
          shap_value = as.numeric(shap_instance[1, ])
        )
      } else {
        # Fallback: take the first row or the first column, whichever fits
        if (nrow(shap_instance) == 1) {
          shap_df <- data.frame(
            feature = colnames(shap_instance),
            shap_value = as.numeric(shap_instance[1, ])
          )
        } else {
          shap_df <- data.frame(
            feature = rownames(shap_instance),
            shap_value = as.numeric(shap_instance[, 1])
          )
        }
      }
      # Keep the 20 features with the largest absolute SHAP value
      shap_df <- shap_df %>%
        arrange(desc(abs(shap_value))) %>%
        head(20)
      cat(sprintf("Top 20 SHAP values (predicted class: %s):\n", pred_class))
      print(shap_df)
      # Determine the order of magnitude (exponent) used to rescale axis labels
      max_abs_value <- max(abs(shap_df$shap_value))
      if (max_abs_value > 0) {
        exponent <- floor(log10(max_abs_value))
        # Round to the nearest multiple of 3 (engineering notation) for cleaner display
        exponent <- round(exponent / 3) * 3
      } else {
        exponent <- 0
      }
      # Label function: rescale values by 10^(-exponent) so tiny SHAP values
      # display as readable numbers; decimal places depend on the magnitude
      scale_factor <- 10^(-exponent)
      label_func <- function(x) {
        scaled <- x * scale_factor
        # Format with appropriate decimal places
        if (abs(exponent) >= 3) {
          sprintf("%.3f", scaled)
        } else {
          sprintf("%.4f", scaled)
        }
      }
      # Axis title carries the x10^k factor whenever values were rescaled
      y_axis_label <- if (abs(exponent) >= 3) {
        sprintf("SHAP Value (×10^%d)", exponent)
      } else {
        "SHAP Value"
      }
      # Horizontal bar chart of the top-20 SHAP values: green bars push toward
      # the predicted class, red bars push away from it
      p_shap <- ggplot(shap_df, aes(x = reorder(feature, shap_value), y = shap_value)) +
        geom_col(aes(fill = shap_value > 0)) +
        scale_fill_manual(
          values = c("TRUE" = "#2E8B57", "FALSE" = "#DC143C"),
          labels = c("TRUE" = "Positive", "FALSE" = "Negative"),
          name = "SHAP Value"
        ) +
        scale_y_continuous(labels = label_func) +
        coord_flip() +
        labs(
          title = sprintf("SHAP Values - %s Instance (%s Family Model)", category, pred_class),
          subtitle = sprintf("Top 20 features by absolute SHAP value"),
          x = "Feature",
          y = y_axis_label
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 14, face = "bold"),
          plot.subtitle = element_text(size = 12),
          axis.text.y = element_text(size = 8)
        )
      print(p_shap)
    } else {
      cat(sprintf("SHAP values format not recognized for %s instance\n", category))
    }
  }
} else {
  cat(sprintf("Model for %s not found.\n", category))
}
--- LIME Explanation for Trojan Instance ---
LIME Explanation:
--- Per-Instance SHAP Values for Trojan Instance ---
Top 20 SHAP values (predicted class: mytrackp):
feature
1 Memory_SharedDirty
2 env_probe_count
3 Logcat_warning
4 Memory_ParcelCount
5 Memory_HeapAlloc
6 API_Database_android.database.sqlite.SQLiteDatabase_update
7 API_Crypto-Hash_java.security.MessageDigest_update
8 Battery_wakelock
9 Network_TotalReceivedBytes
10 accounts_calls
11 API_DeviceData_android.content.ContentResolver_registerContentObserver
12 API_IPC_android.content.ContextWrapper_startService
13 Memory_HeapSize
14 API_Database_android.database.sqlite.SQLiteDatabase_compileStatement
15 API_DexClassLoader_dalvik.system.BaseDexClassLoader_findResource
16 API_DeviceData_android.os.SystemProperties_get
17 Network_TotalTransmittedBytes
18 API__sessions
19 total_DB_read_calls
20 Network_TotalReceivedPackets
shap_value
1 3.825407e-19
2 3.608775e-19
3 -3.176506e-19
4 -2.517067e-19
5 -2.269832e-19
6 2.196397e-19
7 -2.159814e-19
8 2.149976e-19
9 -1.939161e-19
10 1.858948e-19
11 -1.739025e-19
12 -1.525658e-19
13 -1.509509e-19
14 -1.457875e-19
15 -1.297350e-19
16 -1.291733e-19
17 1.277596e-19
18 1.267326e-19
19 -1.251119e-19
20 1.235404e-19
For the Trojan sample, the family SHAP plot shows a classic spyware / tracking fingerprint: large shared memory and heap usage, frequent env probes, warning-level logcat activity, DB updates/compile calls, crypto hashing, wakelocks, and substantial network I/O all push toward the mytrackp family. Many of these resource-intensive memory and environment features (e.g., Memory_SharedDirty, Memory_HeapAlloc, env_probe_count) are also among the top global drivers of the category model, so the same "noisy, always-on, data-hungry" behavior explains both why the model flags the app as a Trojan at the category level and why it narrows in specifically on mytrackp at the family level. LIME coefficients also show some agreement but are very unstable across all these family classifiers; this is likely because ~150 features are too many for a local linear model limited to ~10 features to capture, even locally.