# cleanMicrobiome.R
# Reproducibly clean the flora and pheno files.
#
# Attach the packages whose functions this script calls, so it also runs in a
# fresh R session: dplyr (%>%, select, mutate, filter, ...), stringr (str_*)
# and forcats (fct_recode).
library(dplyr)
library(stringr)
library(forcats)

# Read both tables and drop every column whose name starts with "X"
# (presumably unnamed index columns from the export -- confirm against the
# CSV files).
# stringsAsFactors = TRUE is spelled out because the cleaning steps further
# down call levels() and droplevels() on the text columns; since R 4.0.0
# read.csv() no longer converts strings to factors by default.
pheno <- read.csv(
  "~/Google Drive/Grad School/GR 770 Statistics/R Labs/Data/Pheno.csv",
  stringsAsFactors = TRUE
) %>%
  select(-starts_with("X"))
flora <- read.csv(
  "~/Google Drive/Grad School/GR 770 Statistics/R Labs/Data/Flora.csv",
  stringsAsFactors = TRUE
) %>%
  select(-starts_with("X"))

# Put the phenotype and flora columns side by side in one data frame.
# Rows are paired purely by position: this is only correct if both files list
# the subjects in the same order (no key column is checked here).
# data.frame(..., check.names = FALSE) is what cbind() does for data frames.
mb <- data.frame(pheno, flora, check.names = FALSE)

# Keep the four phenotype columns of interest plus every column whose name
# contains "k__".
mb <- mb %>%
  select(
    AGE_YEARS,
    SEX,
    ALCOHOL_FREQUENCY,
    WEIGHT_KG,
    contains("k__")
  )

# Age and weight may arrive as text (factor or character) because of
# non-numeric responses in the raw data. Convert them to numbers, then filter
# out rows with unreasonable responses.
#
# as.numeric(as.character(x)) works whether x is a factor, a character vector
# or already numeric; the previous as.numeric(levels(x))[x] form only works on
# factors and turns every value into NA otherwise. Entries that are not
# numbers become NA (with a coercion warning) and are removed by filter(),
# which drops rows where a condition evaluates to NA.
mb <- mb %>%
  mutate(
    AGE_YEARS = as.numeric(as.character(AGE_YEARS)),
    WEIGHT_KG = as.numeric(as.character(WEIGHT_KG))
  ) %>%
  filter(AGE_YEARS >= 10, between(WEIGHT_KG, 20, 250))

# Set every Sex value other than "female" or "male" to a real missing value.
#
# factor(..., levels = ) maps anything outside the listed levels to NA. This
# replaces `mb$SEX != c("female", "male")`, which recycles the two strings
# along the column (odd rows compared with "female", even rows with "male")
# instead of testing set membership, and which assigned the string "NA".
# as.character() lets this work for factor and character input alike;
# droplevels() then removes a level that ends up unused.
mb$SEX <- droplevels(
  factor(as.character(mb$SEX), levels = c("female", "male"))
)

# Strip the kingdom/phylum prefixes from the column names: first the
# "k__Bacteria.p__" tag, then the "k__Archaea.p__" tag.
colnames(mb) <- str_remove(
  str_remove(names(mb), "k__Bacteria.p__"),
  "k__Archaea.p__"
)

# Narrow the table to the phenotype columns and five specific microbe columns.
mb <- select(
  mb,
  AGE_YEARS,
  SEX,
  ALCOHOL_FREQUENCY,
  WEIGHT_KG,
  Firmicutes,
  Bacteroidetes,
  Actinobacteria,
  Verrucomicrobia,
  Proteobacteria
)


# Tidy the levels of alcohol consumption. Work on a short scratch variable
# first, then write it back into mb at the end.
acf <- as.character(mb$ALCOHOL_FREQUENCY)

# Keep only answers that contain one of the recognised frequency words and set
# everything else to a real missing value. A single alternation pattern tests
# each answer against all five words; passing the five words as a vector to
# str_detect() would instead pair them with the answers element by element.
# grepl() returns FALSE for NA input, so answers that are already missing
# stay missing.
acf[!grepl("Rare|Occa|Regu|Never|Daily", acf)] <- NA

# Rebuild the factor from the surviving answers (so no unused levels remain)
# and shorten the three verbose labels. The one-column data frame wrapper is
# kept so the scratch object `acf` has the same shape as before.
acf <- data.frame(acf = factor(acf))
acf <- acf %>%
  mutate(
    acf = fct_recode(
      acf,
      "Rarely" = "Rarely (a few times/month)",
      "Regularly" = "Regularly (3-5 times/week)",
      "Occasionally" = "Occasionally (1-2 times/week)"
    )
  )

mb$ALCOHOL_FREQUENCY <- acf$acf


# Give the nine columns short names. The names are assigned by position, so
# the order here must match the column order of mb at this point.
names(mb) <- c(
  "Age", "Sex", "ACF", "Weight.kg",
  "Fir", "Bacter", "Actin", "Verru", "Proteo"
)

# Keep only the samples in which every numeric column is non-zero, i.e. drop
# any row that has a 0 in at least one numeric column. The predicate is
# is.numeric, so it covers all numeric columns, not only the microbe ones.
mb <- filter_if(mb, is.numeric, all_vars(. != 0))