## Load packages and set the default theme for our plots
library(car)
library(pastecs)
library(tidyverse)

theme_set(theme_bw())

## Load the Microbiome_Data.RData file by giving the path to the file
load()


# Section 1: Viewing Normality

## Histogram with Normal Distribution

### Make a histogram with the following properties:
#- X axis: 40 bins of `Fir` concentrations
#- Y axis: **density** of the bins (done by setting `y = ..density..`)
#- black outlines for the bins (`color` aesthetic)
#- white fill for bins

# Add a normal distribution with the following properties:
#- mean and standard deviation of `Fir` microbe
#- red color

ggplot(mb,aes()) +
  geom_histogram(aes(), bins = , fill = , color = ) + # Makes the histogram
  stat_function(fun = , args = list(), color = ) # makes the normal distribution

# 1. How do we determine if the distribution is skewed?
# 2. Does the distribution look normal or skewed?
  
### Make a histogram for the Actin microbe with the same properties



# 3. Does the `Actin` distribution look normal or skewed?


## Q-Q Plots
### Make a Q-Q plot showing the distribution of 'Fir' along with a unity line. Q-Q plots use the 'sample' aesthetic as opposed to X or Y
ggplot(mb,aes()) + 
  geom_qq() +
  geom_qq_line()

# 1. Based on the QQ plot, do you think Fir is skewed? 

###. Show the Q-Q plot for the 'Actin' microbe


# 2. Based on the QQ plot, do you think Actin is skewed?


### Make Q-Q plots for each microbe in one ggplot call. You will need to first gather the data the where microbe labels are all in a single column
mb.g <- gather(mb, key = , value = , )

ggplot(mb.g, aes(sample = , color = )) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap( , scales = "free")


## Boxplots
### Use `geom_boxplot` to create a boxplot showing the distribution of `Fir`. geom_boxplot uses the 'y' aesthetic
ggplot(mb, aes()) +
  geom_boxplot()

# 1. Does the boxplot show `Fir` to be normally distributed?
# 2. What else do boxplots show us that histograms and q-q plots have trouble showing?


### To show a different distribution, plot the distribution of 'Actin' using a bloxplot



### Show boxplots for all microbes on the same plot using a single ggplot call
ggplot(mb.g, aes()) +
  geom_boxplot()



### As opposed to showing them all on the same plot, create multiple plots for the microbes using 'facet_wrap'



# Section 2: Statistical Tests for Normality
## Kolmogorov-Smirnov Test

# This test is performed using the `ks.test` function. The necessary inputs for a one-sample test are:
# `x`: your data
# `y`: the normal probability function (defined by `pnorm`)
# `mean` and `sd`: used to define `pnorm`

### 1. Compare 'Fir' to a normal distribution using a KS test
ks.test(x = , y = , mean = , sd = )

### 2. Compare 'Actin' to a normal distribution using a KS test

## Shapiro-Wilks Test
# This test is performed using the `shapiro.test` function. It only takes one input ever, your distribution, and does a similar process as the KS test. 

### 1. Compare 'Fir' to a normal distribution using the Shapiro-Wilks test
shapiro.test()

### 2. Compare 'Actin' to a normal distribution using the Shapiro-Wilks Test


# Section 3. Calculating Skew and Kurtosis
# In order to calculate skew and kurtosis, we will use the `stat.desc` function from the `pastecs` package.
# `stat.desc` has the following inputs:
  
# - `x`: the distribution to describe
# - `basic`: logical (T/F) value saying whether to output basic statistics. True by default
# - `desc`: logical (T/F) value saying whether to output descriptive statistics. True by default
# - `norm`: logical (T/F) value saying whether to output normal distribution statistics. False by default
# - `p`: probability level to calculate confidence interval on. 0.95 by default

## Calculate skew and kurtosis for 'Fir'
stat.desc()

## Calculate these measures for all microbes at the same time using the 'select' function to input multiple columns
stat.desc(select())

# Section 4. Homoscedasticity and Levene's Test
# Homoscedasticity means that there are equal variances across groups or across a sample. We will use the leveneTest function from the `car` package.
# The leveneTest function uses a formula as an input. Formulas represent an expression for R to parse. It is usually given as:
# dependent ~ inpedendent

## Test homoscedasticity of Bacter as function of Sex
leveneTest(y = , data = )

## Test Verru as a function of ACF


# Section 5. Comparing Means Across Multiple Groups
# Before getting into this section, we need to load a larger version of this dataset stored in "Microbe_Data_Full.RData"
load()

## Using the Raw Data method from the last lab, compare the mean and standard error of the concentration of each microbe across Sexes

### First we will need to gather the microbe concentrations into the same column
mb.g <- gather()

### Make a graph with the following properties:
# `ggplot` properties:
# - `data`: rows that have either Male or Female sex, not NA values. We can remove these rows with `filter` combined with `!is.na`.
# - `x`: Microbe type
# - `y`: Value
# - `fill`: Sex. We will be changing the fill color dependent on the values in Sex

# `geom_bar` properties:
# - The bar graph should plot the mean
# - The bars for the different groups should be side by side as opposed to stacked

# `stat_summary` properties:
# - Use the "errorbar" geom
# - Bars should represent the SEM
# - Errorbars need to be on top of the bar they correspond with (position dodged along with the bar graph)
# - Errorbars should have a width of 0.2

ggplot(data = , aes()) +
  geom_bar(stat = , fun.y = , position = ) +
  stat_summary(fun.data = , geom = , position = , width = )


### Make another plot with the same properties but across ACF instead of Sex



### Make a plot that splits across both Sex and ACF. You will need to facet, choose which one you will fill by and which one you will facet by