Reads a long-format Parquet file into a tibble and pivots it to wide format so that it can be used as machine-learning (ML) input.
loadMLInputTibble(parquet_path)
parquet_path: chr. Path to a long-format sparse Parquet file from upstream processing.
A wide-format tibble with one row per genome (identified by genome_id) and one column per feature, plus a genome_drug.resistant_phenotype column holding the AMR phenotype label ("Resistant" or "Susceptible").
# Toy long-format input: 6 genomes x 2 features, one row per
# (genome, feature) pair, with the phenotype label repeated on every row.
long <- tibble::tibble(
  genome_id = rep(paste0("g", 1:6), each = 2),
  feature_id = rep(c("gene_a", "gene_b"), times = 6),
  value = c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1),
  # g1-g3 are "Resistant", g4-g6 are "Susceptible"; each label is
  # duplicated so both feature rows of a genome carry the same phenotype.
  genome_drug.resistant_phenotype = rep(
    rep(c("Resistant", "Susceptible"), each = 3),
    each = 2
  )
)

# Round-trip through a temporary Parquet file, since the loader takes a path.
tmp <- tempfile(fileext = ".parquet")
arrow::write_parquet(long, tmp)
wide <- loadMLInputTibble(tmp)
unlink(tmp)  # remove the temporary file so the example leaves nothing behind
wide
#> # A tibble: 6 × 4
#> genome_id genome_drug.resistant_phenotype gene_a gene_b
#> <chr> <fct> <dbl> <dbl>
#> 1 g1 Resistant 1 0
#> 2 g2 Resistant 0 1
#> 3 g3 Resistant 1 1
#> 4 g4 Susceptible 0 0
#> 5 g5 Susceptible 1 0
#> 6 g6 Susceptible 0 1