# Clinical prediction models to diagnose neonatal sepsis in low-income and middle-income countries: a scoping review
# Neal and Sturrock et al.
# Updated 13/09/2024

library(tidyverse)
library(here)
library(readxl)
library(ggsci)

## General comments
# 'study_dat' df relates to n included studies (n=82)
# 'evals_dat' df relates to n model evaluations (n=109)
# 'model_dat' df relates to n distinct models (n=44)
# Be careful: some calcs are performed in both frames of reference, with different results

# Read in master data file
path <- here("DataExtraction/NeoSepModsReviewData.xlsx")
dat <- read_xlsx(path, sheet = "Studies", na = "NA")
# View(dat)

# Characteristics of studies ----

# n included studies
id <- paste(dat$`First author`, dat$Year)
id <- id[!grepl("NA NA", id)]
length(id)

# Study years
min(dat$Year, na.rm = T)
max(dat$Year, na.rm = T)

# Study countries
# Extract country data
study_vars <- c(
  "First author",
  "Year",
  "Country of study",
  "WHO region",
  "Income",
  "Context",
  "Number of participants included",
  "Outcome definition",
  "Characteristics of included participants",
  "Risk category of participants"
)

study_dat <- dat |>
  select(!!study_vars) |>
  drop_na(c("First author", "Year"))
study_dat
nrow(study_dat) # matches n included studies

# Extract country strings
# Separate entries for those with multiple countries per study
sc_string <- study_dat$`Country of study` |>
  strsplit(split = ", ") |>
  unlist()
who_string <- study_dat$`WHO region` |>
  strsplit(split = ", ") |>
  unlist()
income_string <- study_dat$Income |>
  strsplit(split = ", ") |>
  unlist()

# n unique countries
length(unique(sc_string))
study_dat |> count(`Country of study`, sort = T) # some multiple countries
table(sc_string) |> sort(decreasing = T) # unlisted
sc_table <- as_tibble(table(sc_string))
# write_csv(sc_table, here("DataExtraction/sc_string_table.csv"))
37 / 82 * 100

# WHO regions
study_dat |> count(`WHO region`, sort = T) # some multiple regions
table(who_string) |> sort(decreasing = T) # unlisted
48 / 82 * 100
4 / 82 * 100

# Economic status
study_dat |> count(Income, sort = T)
table(income_string)
52 / 82 * 100
30 / 82 * 100

# Study context
any(is.na(study_dat$Context)) # no missing values
study_dat |> count(Context, sort = T)
64 / 82 * 100
12 / 82 * 100
3 / 82 * 100
1 / 82 * 100
2 / 82 * 100

# n participants
sum(study_dat$`Number of participants included`, na.rm = T)
hist(study_dat$`Number of participants included`, breaks = 20) # non-normal
median(study_dat$`Number of participants included`, na.rm = T)
range(study_dat$`Number of participants included`, na.rm = T)
IQR(study_dat$`Number of participants included`, na.rm = T)

# Study population
head(study_dat$`Characteristics of included participants`)
study_dat$`Characteristics of included participants`[grepl(
  "preterm", study_dat$`Characteristics of included participants`, ignore.case = T)]
# n=4 preterm only
4 / 82 * 100
study_dat$`Characteristics of included participants`[grepl(
  "low", study_dat$`Characteristics of included participants`, ignore.case = T)]
# n=5 low birth weight
5 / 82 * 100

# Risk category by study
table(study_dat$`Risk category of participants`)
prop.table(table(study_dat$`Risk category of participants`)) * 100

# Outcome definitions
study_dat |>
  count(`Outcome definition`, sort = T) |>
  print(n = Inf)
def <- study_dat$`Outcome definition`[grepl("blood|CSF|spinal|culture", study_dat$`Outcome definition`, ignore.case = T)]
length(def)
# n=75 contain positive blood or CSF culture in definition
75 / 82 * 100
length(def[grepl("clinical|suspicion|symptom", def, ignore.case = T)])
# n=18 of these include clinical features or suspicion of sepsis
18 / 82 * 100

# Characteristics of models ----

# Extract model evaluation data
model_vars <- c(
  "First author",
  "Year",
  "Name of prediction model",
  "Distinct scores",
  "Timing of sepsis",
  "Modelling method",
  "Outcome definition",
  "Risk category of participants",
  "Type of model",
  "Sensitivity",
  "Specificity",
  "Positive predictive value",
  "Negative predictive value",
  "AUC",
  "Other measure of model performance",
  "Acceptability",
  "Mortality",
  "Antibiotic prescriptions",
  "Any other factors relating to performance"
)
evals_dat <- dat |> select(!!model_vars)
evals_dat

# n evaluations
# model has been evaluated if any of these vars non-missing...
eval_vars <- model_vars[(length(model_vars) - 9):length(model_vars)]
evals_dat <- evals_dat |>
  rowwise() |>
  mutate(eval = if_else(all(is.na(across(all_of(eval_vars)))), 0, 1)) |>
  ungroup()
table(evals_dat$eval)
# the above steps should be redundant; only studies with these data included
# but, useful check to ensure Excel format is correct
sum(evals_dat$eval == 1)

# Distinct models
# Read in distinct model data sheet from Excel file
model_dat <- read_xlsx(path, sheet = "Distinct models", na = "NA")
# View(model_dat)
# n distinct models
nrow(model_dat) ## 44
# should equal n unique entries in `Distinct scores` col of `Studies` sheet & evals_dat
# since distinct model name only recorded once per study, even if evaluated multiple times per study
# e.g., evals_dat |> select("First author", Year, "Distinct scores") |> View()
evals_dat |>
  drop_na(`Distinct scores`) |>
  pull(`Distinct scores`) |>
  unique() |>
  length() ## also 44; confirms Excel formatting is correct

# n studies per model
# n.b. _studies_, not evaluations (a study can evaluate one model multiple times)
evals_dat |>
  drop_na(`Distinct scores`) |>
  count(`Distinct scores`, sort = T) |>
  print(n = Inf)
32 / 82 * 100
# models validated in only one study
evals_dat |>
  drop_na(`Distinct scores`) |>
  count(`Distinct scores`, sort = T) |>
  filter(n == 1) |>
  nrow()
34 / 44 * 100

# Predictors
# Some predictor columns are empty, so drop these...
dim(model_dat)
model_dat <- model_dat |> select(where(~ !(all(is.na(.)))))
dim(model_dat)

predictors <- names(model_dat |> select(-(1:6)))
lab_predictors <- predictors[1:53]
clinical_predictors <- predictors[54:length(predictors)]
length(predictors) ##
length(lab_predictors) ##
length(clinical_predictors) ##

# n predictors per model
hist(model_dat$`Number of variables`, breaks = 20) # non-normal
median(model_dat$`Number of variables`)
range(model_dat$`Number of variables`)
IQR(model_dat$`Number of variables`)

# Clinical, lab or both
table(model_dat$`Type of model`)
prop.table(table(model_dat$`Type of model`)) * 100

# Frequency of each predictor
# Long format
model_dat_long <- model_dat |>
  select(-c("Original author", "Modelling method", "Outcome", "Type of model",
            "Number of variables")) |>
  pivot_longer(cols = -c("Model"))
model_dat_long

predfreq <- model_dat_long |>
  group_by(name) |>
  summarise(freq = sum(value, na.rm = T)) |>
  # arrange(desc(freq)) |>
  ungroup()
predfreq
# write_csv(predfreq, here("DataExtraction/pred_freq_table.csv"))

# Modelling method
model_dat |> count(`Modelling method`, sort = T)
16 / 44 * 100
10 / 44 * 100

# Model performance ----

# Performance reporting
# Don't report either sens or spec...
sum(is.na(evals_dat$Sensitivity & is.na(evals_dat$Specificity)))
4 / 82 * 100
# Report other metrics...
eval_vars
sum(!is.na(evals_dat$AUC))
32 / 82 * 100
sum(!is.na(evals_dat$Mortality))
sum(!is.na(evals_dat$`Antibiotic prescriptions`))
# evals_dat$`Other measure of model performance`
# evals_dat$`Any other factors relating to performance`

# Overall
evals_dat |> 
  summarise(
    median_sens = median(Sensitivity, na.rm = T) * 100,
    min_sens = min(Sensitivity, na.rm = T) * 100,
    max_sens = max(Sensitivity, na.rm = T) * 100,
    median_spec = median(Specificity, na.rm = T) * 100,
    min_spec = min(Specificity, na.rm = T) * 100,
    max_spec = max(Specificity, na.rm = T) * 100
  )

# Clinical signs vs. lab tests
evals_dat |>
  group_by(`Type of model`) |>
  summarise(
    n_evals = n(),
    median_sens = median(Sensitivity, na.rm = T) * 100,
    min_sens = min(Sensitivity, na.rm = T) * 100,
    max_sens = max(Sensitivity, na.rm = T) * 100,
    median_spec = median(Specificity, na.rm = T) * 100,
    min_spec = min(Specificity, na.rm = T) * 100,
    max_spec = max(Specificity, na.rm = T) * 100
  )

# Target population
# 'High risk' = existing clinical suspicion of sepsis
evals_dat |>
  group_by(`Risk category of participants`) |>
  summarise(
    n_evals = n(),
    median_sens = median(Sensitivity, na.rm = T) * 100,
    min_sens = min(Sensitivity, na.rm = T) * 100,
    max_sens = max(Sensitivity, na.rm = T) * 100,
    median_spec = median(Specificity, na.rm = T) * 100,
    min_spec = min(Specificity, na.rm = T) * 100,
    max_spec = max(Specificity, na.rm = T) * 100
  )

# Timing of sepsis
evals_dat |>
  group_by(`Timing of sepsis`) |>
  summarise(
    n_evals = n(),
    median_sens = median(Sensitivity, na.rm = T) * 100,
    min_sens = min(Sensitivity, na.rm = T) * 100,
    max_sens = max(Sensitivity, na.rm = T) * 100,
    median_spec = median(Specificity, na.rm = T) * 100,
    min_spec = min(Specificity, na.rm = T) * 100,
    max_spec = max(Specificity, na.rm = T) * 100
  )

# Figures ----

# Publications over time

pal_lancet("lanonc")(9)

pub_fig <- dat |>
  drop_na(Year) |>
  count(Year) |>
  ggplot(aes(x = Year, y = n)) +
  geom_col(fill = "#00468BFF") +
  geom_smooth(method = "loess", se = FALSE, colour = "#ED0000FF") +
  scale_x_continuous(breaks = seq(2003, 2024, 3)) +
  scale_y_continuous(breaks = seq(0, 12, 2), limits = c(0, 12), expand = c(0, 0)) +
  labs(x = "Year of publication", y = "Number of publications") +
  theme_bw() +
  NULL
pub_fig
fig_path <- here("DataExtraction/pub_fig.png")
ggsave(fig_path, pub_fig, width = 4, height = 4)

# PRISMA ----

# Databases and registers
## Records identified = 8353 (from Rayyan)
## Records screened = 4598 (from Rayyan)

## Reports sought for retrieval (i.e. full texts screened)
prisma_path <- here("Screening/NeoSepModsReview_FullTextDecisions.csv")
prisma_dat <- read_csv(prisma_path, na = "NA")
database_dat <- prisma_dat |> filter(source == "database")
nrow(database_dat) ##

## Reports not retrieved
sum(database_dat$reason == "No full text", na.rm = T) ##

## Reports assessed for eligibility
nrow(database_dat) - sum(database_dat$reason == "No full text", na.rm = T) ##

## Reasons for exclusion
database_dat <- database_dat |> filter(decision == "Exclude" & reason != "No full text")
nrow(database_dat) ##
table(database_dat$reason, useNA = "always")

# Other methods
other_dat <- prisma_dat |> filter(source == "other")
nrow(other_dat) ##

## Reports not retrieved
sum(other_dat$reason == "No full text", na.rm = T) ##
sum(other_dat$reason == "Unable to find", na.rm = T) ##
nrow(other_dat) - sum(other_dat$reason == "No full text", na.rm = T) - sum(other_dat$reason == "Unable to find", na.rm = T) ##

## Reasons for exclusion
other_dat <- other_dat |> filter(decision == "Exclude" & reason != "No full text" & reason != "Unable to find")
nrow(other_dat) ##
table(other_dat$reason, useNA = "always")