### R script - microbial community analysis ### Verburg et al. ###Effects of clinical wastewater on the bacterial community structure along a wastewater pathway ### ################################ DADA2 error correction of quality-filtered sequences ################################ ##Sample inference script: ## follow script: https://benjjneb.github.io/dada2/bigdata_paired.html) library(dada2); packageVersion("dada2") # File parsing of raw sequence data filtpathF <- "~/path-to-forward-reads/" # fwd reads filtpathR <- "~/path-to-reverse-reads/" # rev reads filtFs <- list.files(filtpathF, pattern="fastq", full.names = TRUE) filtRs <- list.files(filtpathR, pattern="fastq", full.names = TRUE) sample.names <- sapply(strsplit(basename(filtFs), "_"), `[`, 1) # Assumes filename = samplename_XXX.fastq.gz sample.namesR <- sapply(strsplit(basename(filtRs), "_"), `[`, 1) # Assumes filename = samplename_XXX.fastq.gz if(!identical(sample.names, sample.namesR)) stop("Forward and reverse files do not match.") names(filtFs) <- sample.names names(filtRs) <- sample.names set.seed(100) # Learn forward error rates errF <- learnErrors(filtFs, nbases=1e8, multithread=FALSE, MAX_CONSIST = 20) # Learn reverse error rates errR <- learnErrors(filtRs, nbases=1e8, multithread=FALSE, MAX_CONSIST = 20) # Sample inference and merger of paired-end reads mergers <- vector("list", length(sample.names)) names(mergers) <- sample.names for(sam in sample.names) { cat("Processing:", sam, "\n") derepF <- derepFastq(filtFs[[sam]]) ddF <- dada(derepF, err=errF, multithread=FALSE) derepR <- derepFastq(filtRs[[sam]]) ddR <- dada(derepR, err=errR, multithread=FALSE) merger <- mergePairs(ddF, derepF, ddR, derepR) mergers[[sam]] <- merger } rm(derepF); rm(derepR) # Construct sequence table and remove chimeras seqtab <- makeSequenceTable(mergers) saveRDS(seqtab, "~/path-to-output/seqtab.rds") # CHANGE ME to where you want sequence table saved ##Chimera/Taxonomy script: library(dada2); packageVersion("dada2") # Remove chimeras seqtab2 <- removeBimeraDenovo(seqtab, method="consensus", multithread=FALSE) saveRDS(seqtab2, "~/path-to-output/seqtab.final.rds") seqtab.nochim <- readRDS("seqtab_final.rds") # download SILVA training set in a folder named Training taxa_final <- assignTaxonomy(seqtab.nochim, "~/path-to-training/silva_nr_v132_train_set.fa.gz", multithread = TRUE) taxa_final_spec <- addSpecies(taxa_final, "~/path-to-training/silva_species_assignment_v132.fa.gz") saveRDS(taxa_final_spec, "~/path-to-output/taxa_final.rds") write.table(t(seqtab.nochim), "seqtab-nochim_final_IVER.txt", sep="\t", row.names=TRUE, col.names=NA, quote=FALSE) uniquesToFasta(seqtab.nochim, fout='rep-seqs_final_IVER.fna', ids=colnames(seqtab.nochim)) ################################ Create phyloseq object ################################ ### create phyloseq object ### source('http://bioconductor.org/biocLite.R') BioClite(phyloseq) library(phyloseq) library(Biostrings) rep_seqs <- readDNAStringSet("~/path-to-output/rep-seqs_final_IVER.fna", format="fasta", nrec=-1L, skip=0L, seek.first.rec=FALSE, use.names=TRUE) str(rep_seqs) # combine biom_table in txt format with taxonomic information setdiff(rownames(taxa_final), rownames(t(seqtab_nochim_final))) taxa_final_spec2 <- subset(taxa_final_spec, rownames(taxa_final_spec) %in% rownames(t(seqtab_nochim))) identical(rownames(taxa_final_spec2), colnames(seqtab_nochim)) # TRUE # write taxa table in txt write.table(as.data.frame(taxa_final_spec2),"~/path-to-output/taxa_final_spec2.txt", sep = "\t", dec = ".") # go to folder with seqtab-nochim.txt in command line # Add a special header for BIOM: echo -n "#OTU Table" | cat - seqtab-nochim.txt > biom-table.txt # add taxonomic information as last column ('taxonomy') in format Bacteria;Proteobacteria;Alphaproteobacteria;etc # Chance A1 cell to #OTU ID ################################ QIIME2 in command line ################################ ## open QIIME2: module spider QIIME # copy desired QIIME version module load QIIME2/2018.11 #open QIIME2 qiime ## For the feature-table, there are two steps we have to do first: # Add a special header for BIOM: echo -n "#OTU Table" | cat - seqtab-nochim.txt > biom-table.txt # add taxonomic information as last column ('taxonomy') in format Bacteria;Proteobacteria;Alphaproteobacteria;etc # Chance A1 cell to #OTU ID # Convert to BIOM v2.1: biom convert -i biom-table_final_IVER_taxAdded.txt -o biom-table_final_IVER_taxAdded.biom --table-type="OTU table" --to-hdf5 # import sequences (rep seqs) qiime tools import \ --input-path rep-seqs_final_IVER.fna \ --type 'FeatureData[Sequence]' \ --output-path rep-seqs_final_IVER.qza # Now we can import that as well: qiime tools import \ --input-path table.biom \ --type 'FeatureTable[Frequency]' \ --source-format BIOMV210Format \ --output-path table_ASVs.qza ### Generate a tree for phylogenetic diversity analyses ### ## QIIME supports several phylogenetic diversity metrics, including Faith’s Phylogenetic Diversity and ## weighted and unweighted UniFrac. In addition to counts of features per sample (i.e., the data in the ## FeatureTable[Frequency] QIIME 2 artifact), these metrics require a rooted phylogenetic tree relating ## the features to one another. This information will be stored in a Phylogeny[Rooted] QIIME 2 artifact. ## The following steps will generate this QIIME 2 artifact. ## First, we perform a multiple sequence alignment of the sequences in our FeatureData[Sequence] to create ## a FeatureData[AlignedSequence] QIIME 2 artifact. Here we do this with the mafft program. # follow: https://docs.qiime2.org/2017.9/tutorials/moving-pictures/ qiime alignment mafft \ --i-sequences rep-seqs_final_IVER.qza \ --o-alignment aligned_rep-seqs_final_IVER.qza ## Next, we mask (or filter) the alignment to remove positions that are highly variable. These positions ## are generally considered to add noise to a resulting phylogenetic tree. qiime alignment mask \ --i-alignment aligned_rep-seqs_final_IVER.qza \ --o-masked-alignment masked-aligned_rep-seqs_final_IVER.qza ## Next, we’ll apply FastTree to generate a phylogenetic tree from the masked alignment. qiime phylogeny fasttree \ --i-alignment masked-aligned_rep-seqs_final_IVER.qza \ --o-tree IVER_final_unrooted-tree.qza ## The FastTree program creates an unrooted tree, so in the final step in this section we apply midpoint ## rooting to place the root of the tree at the midpoint of the longest tip-to-tip distance in the unrooted tree. qiime phylogeny midpoint-root \ --i-tree IVER_final_unrooted-tree.qza \ --o-rooted-tree IVER_final_rooted-tree.qza mkdir qza_to_physeq cp IVER_final_rooted-tree.qza ./qza_to_physeq cp masked-aligned_rep-seqs_final_IVER.qza ./qza_to_physeq cd qza_to_physeq/ # export qza files qiime tools export \ --input-path masked-aligned_rep-seqs_final_IVER.qza \ --output-path aligned-seqs #qiime tools export \ # --input-path table_6314A.qza \ # --output-path exported-table qiime tools export \ --input-path IVER_final_rooted-tree.qza \ --output-path exported-tree # Convert to BIOM v2.1: biom convert -i biom-table_final_IVER_taxAdded.txt -o biom-table_final_IVER_taxAdded.biom --table-type="OTU table" --to-hdf5 # add new file to ~/path-to-output/ ################################ Back in R -- create phyloseq object ################################ library(phyloseq) # load biom formatted OTU table biom_tax <- import_biom("~/path-to-output/biom-table_final_IVER_taxAdded.biom", parseFunction = parse_taxonomy_default) # load phylogenetic tree, and check whether the tree is rooted library(ape) tree_IVER <- read_tree("~/path-to-output/exported-tree/tree.nwk") is.rooted(tree_IVER) # create aspects of physeq object mapping <- import_qiime_sample_data("~/path-to-output/sample-metadata.txt") ASV_table <- tax_table(biom_tax) phy_tree <- phy_tree(tree_IVER) features <- otu_table(biom_tax) ## OTU names are still full sequences ## now change the names to ASV_### before merging objects into a single phyloseq object # change taxa names (feature names to "ASV_#") ASV_names_IVER <- c(sprintf("ASV_%d", 1:length(taxa_names(ASV_table)))) taxa_names(features) <- ASV_names_IVER taxa_names(ASV_table) <- ASV_names_IVER taxa_names(phy_tree) <- ASV_names_IVER # is the phylogenetic tree binary? is.binary.tree(phy_tree) # TRUE (is fine) # FALSE # resolve polychotomous nodes phy_tree_resolved <- multi2di(phy_tree) is.binary.tree(phy_tree_resolved) # create new phy_tree tree2 <- phy_tree_resolved phy_tree2 <- phy_tree(phy_tree_resolved) # Combine mapping file, otu_table, taxonomic information, phylogenetic tree into a single phyloseq object psdata_IVER <- merge_phyloseq(features,ASV_table,phy_tree2) saveRDS(psdata_IVER, "phyloseq_IVER.rds") # make dataframe where new ASV names are linked to their respective representative sequences # to align ASV names with sequence names in tax_table check_tax_IVER <- data.frame(tax_table(biom_tax)@.Data) check_tax_IVER$sequence <- rownames(check_tax_IVER) merged_ASV_taxtable_IVER <- data.frame(cbind(ASV_names_IVER, check_tax_IVER)) colnames(merged_ASV_taxtable_IVER) <- c("ASV_names_IVER", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "representative_sequence") rownames(merged_ASV_taxtable_IVER) <- merged_ASV_taxtable_IVER$ASV_names_IVER # write table with ASV names, taxonomic information and ASV representative sequence write.table(merged_ASV_taxtable_IVER, "ASV_taxonomic_information_AND_rep_seq.txt", sep = "\t", dec = ".") ## remove (potential) non-bacterial ASVs colnames(tax_table(psdata_IVER)) <- c(k="Kingdom", p="Phylum", c="Class", o="Order", f="Family", g="Genus", s="Species") psdata_IVER_cleaned <- subset_taxa(psdata_IVER, Kingdom == "Bacteria") # filter out any ASVs assigned to Archaea psdata_IVER_cleaned <- subset_taxa(psdata_IVER_cleaned, Class != "Chloroplast") # filter out any ASVs assigned to Chloroplast (plant endosymbionts) psdata_IVER_cleaned <- subset_taxa(psdata_IVER_cleaned, Family != "Mitochondria") # filter out any ASVs assigned to Mitochondria (eukaryotic endosymbionts) psdata_IVER_cleaned # check tree for weird outliers ape::write.tree(phy_tree(psdata_IVER_cleaned), "/Users/pietervanveelen/Google Drive/Rdata backups/Wetsus Projects/IVER/exported-tree/phyloseq_cleaned_tree.nwk") # remove NAs from Site vector in psdata_IVER_cleaned psdata_IVER_cleaned_noNA <- subset_samples(psdata_IVER_cleaned, Site != "S1") psdata_IVER_cleaned_noNA <- subset_samples(psdata_IVER_cleaned_noNA, Site != "S2") saveRDS(psdata_IVER_cleaned_noNA, "psdata_IVER_cleaned_noNA.RDS") ################################ R -- Beta diversity analysis ################################ ### All samples input psdata_IVER_cleaned_noNA # normalise read counts using TSS psdata_IVER_cleaned_noNA_rel <- transform_sample_counts(psdata_IVER_cleaned_noNA, function(x) x/sum(x)) sample_data(psdata_IVER_cleaned_noNA_rel)$Site <- factor(sample_data(psdata_IVER_cleaned_noNA_rel)$Site, levels = c("H", "N", "C","I", "E", "up", "down", "control")) # data frame for stats df_psdata_IVER_cleaned_noNA_rel <- as(sample_data(psdata_IVER_cleaned_noNA_rel), "data.frame") # distance matrices all_nona_bray <- vegdist(t(otu_table(psdata_IVER_cleaned_noNA_rel)), binary = F, na.rm = F, method = "bray") all_nona_uu <- UniFrac(psdata_IVER_cleaned_noNA_rel) all_nona_wu <- UniFrac(psdata_IVER_cleaned_noNA_rel, weighted = T, normalized = T) # Statistical differences between Sites without discharged water samples # constrained analysis (db-redundancy analysis using the adonis2 function) adonis_all_nona_bray <- adonis2(all_nona_bray ~Site, permutations = 9999, df_psdata_IVER_cleaned_noNA_rel) adonis_all_nona_uu <- adonis2(all_nona_uu ~Site, permutations = 9999, df_psdata_IVER_cleaned_noNA_rel) adonis_all_nona_wu <- adonis2(all_nona_wu ~Site, permutations = 9999, df_psdata_IVER_cleaned_noNA_rel) # ordinations # PCOA ord_all_nona_PCoA_bray <- ordinate(psdata_IVER_cleaned_noNA_rel, method = "PCoA", distance = "bray") ord_all_nona_PCoA_uu <- ordinate(psdata_IVER_cleaned_noNA_rel, method = "PCoA", distance = "uunifrac") ord_all_nona_PCoA_wu <- ordinate(psdata_IVER_cleaned_noNA_rel, method = "PCoA", distance = "wunifrac",normalized = T) # plot ordinations all samples pdf("Betadiversity_weighted_UniFrac_allSAamples_pcoa.pdf", useDingbats = F, height = 4, width = 6) beta_wu_WWTP_pcoa <- plot_ordination(psdata_IVER_cleaned_noNA_rel, ord_all_nona_PCoA_wu, axes = c(1,2), color = "Site") wu_plot_all <- beta_wu_WWTP_pcoa + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)])+ ggtitle("PCoA weighted UniFrac") dev.off() pdf("Betadiversity_unweighted_UniFrac_allSAamples_pcoa.pdf", useDingbats = F, height = 4, width = 6) beta_uu_WWTP_pcoa <- plot_ordination(psdata_IVER_cleaned_noNA_rel, ord_all_nona_PCoA_uu, axes = c(1,2), color = "Site") uu_plot_all <- beta_uu_WWTP_pcoa + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)])+ ggtitle("PCoA unweighted UniFrac") dev.off() pdf("Betadiversity_unweighted_AND_weighted_UniFrac_allSAamples_pcoa.pdf", useDingbats = F, height = 4, width = 10) gridExtra::grid.arrange(ncol = 2,uu_plot_all ,wu_plot_all) dev.off() # stats global_permanova_uu_allSites <- adonis(all_nona_uu~Site, data = df_psdata_IVER_cleaned_noNA_rel, permutations = 999) global_permanova_wu_allSites <- adonis(all_nona_wu~Site, data = df_psdata_IVER_cleaned_noNA_rel, permutations = 999) capture.output(global_permanova_uu_allSites, file = "Stats_global_permanova_unweighted-unifrac_allSites.txt") capture.output(global_permanova_wu_allSites, file = "Stats_global_permanova_weighted-unifrac_allSites.txt") pairwise_permanova_uu_allSites <- pairwise.perm.manova(all_nona_uu, df_psdata_IVER_cleaned_noNA_rel$Site, progress = T) pairwise_permanova_wu_allSites <- pairwise.perm.manova(all_nona_wu, df_psdata_IVER_cleaned_noNA_rel$Site, progress = T) capture.output(pairwise_permanova_uu_allSites, file = "Stats_pairwise_permanova_unweighted-unifrac_allSites.txt") capture.output(pairwise_permanova_wu_allSites, file = "Stats_pairwise_permanova_weighted-unifrac_allSites.txt") pcoa_allSamples_betadisper_uu <- permutest(betadisper(all_nona_uu, df_psdata_IVER_cleaned_noNA_rel$Site, type = "median"), pairwise = TRUE) pcoa_allSamples_betadisper_wu <- permutest(betadisper(all_nona_wu, df_psdata_IVER_cleaned_noNA_rel$Site, type = "median"), pairwise = TRUE) capture.output(pcoa_allSamples_betadisper_uu, file = "Stats_pairwise_Betadisper_unweighted-unifrac_allSites.txt") capture.output(pcoa_allSamples_betadisper_wu, file = "Stats_pairwise_Betadisper_weighted-unifrac_allSites.txt") ### Non-river samples only #input psdata_IVER_cleaned_noNA # Range coverage: summary(sample_sums(psdata_IVER_cleaned_noNA)) # remove surface waters psdata_IVER_WWTP <- subset_samples(psdata_IVER_cleaned_noNA, Site != "up" & Site != "down" & Site !="control") psdata_IVER_WWTP_rel <- transform_sample_counts(psdata_IVER_WWTP, function(x) x/sum(x)) sample_data(psdata_IVER_WWTP_rel)$Site <- factor(sample_data(psdata_IVER_WWTP_rel)$Site, levels = c("H", "N", "C","I", "E")) # Check content droplevels(sample_data(psdata_IVER_WWTP_rel)$Site) psdata_IVER_WWTP_rel df_psdata_IVER_WWTP_rel <- as(sample_data(psdata_IVER_WWTP_rel), "data.frame") # distance matrices WWTP_bray <- vegdist(t(otu_table(psdata_IVER_WWTP_rel)), binary = F, na.rm = F, method = "bray") WWTP_uu <- UniFrac(psdata_IVER_WWTP_rel) WWTP_wu <- UniFrac(psdata_IVER_WWTP_rel, weighted = T, normalized = T) # Statistical differences between Sites without discharged water samples # constrained analysis (db-redundancy analysis using the adonis2 function) adonis_WWTP_bray <- adonis2(WWTP_bray ~Site, permutations = 9999, df_psdata_IVER_WWTP_rel) adonis_WWTP_uu <- adonis2(WWTP_uu ~Site, permutations = 9999, df_psdata_IVER_WWTP_rel) adonis_WWTP_wu <- adonis2(WWTP_wu ~Site, permutations = 9999, df_psdata_IVER_WWTP_rel) uu_site_global <- adonis(WWTP_uu~Site, permutations = 999, data = df_psdata_IVER_WWTP_rel) wu_site_global <- adonis(WWTP_wu~Site, permutations = 999, data = df_psdata_IVER_WWTP_rel) capture.output(uu_site_global, file = "unweighted_unifrac_site_permanova_NoSurfaceWaters.txt") capture.output(wu_site_global, file = "weighted_unifrac_site_permanova_NoSurfaceWaters.txt") # pairwise site comparisons unweighted and weighted unifrac BiocManager::install("RVAideMemoire") library(RVAideMemoire) uu_site_pairwise <- pairwise.perm.manova(WWTP_uu, df_psdata_IVER_WWTP_rel$Site) wu_site_pairwise <- pairwise.perm.manova(WWTP_wu, df_psdata_IVER_WWTP_rel$Site) capture.output(uu_site_pairwise, file = "unweighted_unifrac_site_pairwise_permanova_NoSurfaceWaters.txt") capture.output(wu_site_pairwise, file = "weighted_unifrac_site_pairwise_permanova_NoSurfaceWaters.txt") # ordinations # PCOA ord_WWTP_PCoA_bray <- ordinate(psdata_IVER_WWTP_rel, method = "PCoA", distance = "bray") ord_WWTP_PCoA_uu <- ordinate(psdata_IVER_WWTP_rel, method = "PCoA", distance = "uunifrac") ord_WWTP_PCoA_wu <- ordinate(psdata_IVER_WWTP_rel, method = "PCoA", distance = "wunifrac",normalized = T) #NMDS ord_WWTP_NMDS_bray <- ordinate(psdata_IVER_WWTP_rel, method = "NMDS", distance = "bray") ord_WWTP_NMDS_uu <- ordinate(psdata_IVER_WWTP_rel, method = "NMDS", distance = "uunifrac") ord_WWTP_NMDS_wu <- ordinate(psdata_IVER_WWTP_rel, method = "NMDS", distance = "wunifrac",normalized = T) # group dispersions library(vegan) #pcoa pcoa_WWTP_betadisper_bray <- permutest(betadisper(WWTP_bray, df_psdata_IVER_WWTP_rel$Site, type = "median"), pairwise = TRUE) pcoa_WWTP_betadisper_uu <- permutest(betadisper(WWTP_uu, df_psdata_IVER_WWTP_rel$Site, type = "median"), pairwise = TRUE) pcoa_WWTP_betadisper_wu <- permutest(betadisper(WWTP_wu, df_psdata_IVER_WWTP_rel$Site, type = "median"), pairwise = TRUE) capture.output(pcoa_WWTP_betadisper_uu, file = "Stats_Betadisper_pcoa_WWTP_betadisper_unweighted-unifrac.txt") capture.output(pcoa_WWTP_betadisper_wu, file = "Stats_Betadisper_pcoa_WWTP_betadisper_weighted-unifrac.txt") # plot ordinations pcoa_data_WWTP_uu <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_PCoA_uu, "Samples", justDF=F) pcoa_data_WWTP_wu <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_PCoA_wu, "Samples", justDF=F) pdf("Betadiversity_unweighted_UniFrac_WWTP-SAamples_pcoa.pdf", useDingbats = F, height = 4, width = 5) beta_uu_WWTP_pcoa <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_PCoA_uu, axes = c(1,2), color = "Site") uu_plot_WWTP <- beta_uu_WWTP_pcoa + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)])+ ggtitle("PCoA unweighted UniFrac WWTP") uu_plot_WWTP dev.off() pdf("Betadiversity_weighted_UniFrac_WWTP-SAamples_pcoa.pdf", useDingbats = F, height = 4, width = 5) beta_wu_WWTP_pcoa <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_PCoA_wu, axes = c(1,2), color = "Site") wu_plot_WWTP <- beta_wu_WWTP_pcoa + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)])+ ggtitle("PCoA weighted UniFrac WWTP") wu_plot_WWTP dev.off() # write data frame # PCOA pcoa_data_WWTP_bray <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_PCoA_bray, "Samples", justDF=T) pcoa_data_WWTP_uu <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_PCoA_uu, "Samples", justDF=T) pcoa_data_WWTP_wu <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_PCoA_wu, "Samples", justDF=T) #NMDS nmds_data_WWTP_bray <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_NMDS_bray, "Samples", justDF=T) nmds_data_WWTP_uu <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_NMDS_uu, "Samples", justDF=T) nmds_data_WWTP_wu <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_NMDS_wu, "Samples", justDF=T) ## test group differences of Bray-Curtis dissimilarities using lm head(pcoa_data_WWTP_bray) str(pcoa_data_WWTP_bray) # PCoA Axis 1 LM # linear model lm_axis1_pcoa_bray <- lm(Axis.1 ~ Site + Month ,data=pcoa_data_WWTP_bray) lm_axis1_pcoa_bray2 <- lm(Axis.1 ~ Site + I(Month^2) ,data=pcoa_data_WWTP_bray) anova(lm_axis1_pcoa_bray) # assumptions hist(resid(lm_axis1_pcoa_bray)) plot(resid(lm_axis1_pcoa_bray), fitted(lm_axis1_pcoa_bray)) anova(lm_axis1_pcoa_bray) # PCoA Axis 2 LM # linear model lm_axis2_pcoa_bray <- lm(Axis.2 ~ Site + Month ,data=pcoa_data_WWTP_bray) lm_axis2_pcoa_bray2 <- lm(Axis.2 ~ Site + I(Month^2) ,data=pcoa_data_WWTP_bray) # assumptions hist(resid(lm_axis2_pcoa_bray)) plot(resid(lm_axis2_pcoa_bray), fitted(lm_axis2_pcoa_bray)) anova(lm_axis2_pcoa_bray) # posthoc Contrasts between Sites library(multcomp) summary(glht(lm_axis1_pcoa_bray, mcp(Site = "Tukey")), adjusted(type="bonferroni")) summary(glht(lm_axis2_pcoa_bray, mcp(Site = "Tukey")), adjusted(type="bonferroni")) # plot Axis2 ~ Month (is significant, see anova) ggplot(pcoa_data_WWTP_bray, aes(x=Month, y= Axis.2, color = Site)) + geom_point() pdf("Betadiversity_BrayCurtisdissimilarity_Sites_PCoA.pdf", width = 10, height = 5, useDingbats = F) axis.1_pcoa_bray <- ggplot(pcoa_data_WWTP_bray, aes(x=Site, y= Axis.1, color = Site)) + geom_point(aes(), position = position_jitterdodge(0.5), alpha = 0.3) + geom_boxplot(aes(), position = position_dodge(0.5), alpha = 0.7) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) axis.2_pcoa_bray <- ggplot(pcoa_data_WWTP_bray, aes(x=Site, y= Axis.2, color = Site)) + geom_point(aes(), position = position_jitterdodge(0.5), alpha = 0.3) + geom_boxplot(aes(), position = position_dodge(0.5), alpha = 0.7) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) grid.arrange(axis.1_pcoa_bray, axis.2_pcoa_bray, ncol =2) dev.off() ## test group differences of weighted UniFrac distances using lm head(pcoa_data_WWTP_wu) str(pcoa_data_WWTP_wu) # PCoA Axis 1 LM # linear model lm_axis1_pcoa_wu <- lm(Axis.1 ~ Site + Month ,data=pcoa_data_WWTP_wu) lm_axis1_pcoa_wu2 <- lm(Axis.1 ~ Site + I(Month^2) ,data=pcoa_data_WWTP_wu) anova(lm_axis1_pcoa_wu) # assumptions hist(resid(lm_axis1_pcoa_wu)) plot(resid(lm_axis1_pcoa_wu), fitted(lm_axis1_pcoa_wu)) anova(lm_axis1_pcoa_wu) # PCoA Axis 2 LM # linear model lm_axis2_pcoa_wu <- lm(Axis.2 ~ Site + Month ,data=pcoa_data_WWTP_wu) lm_axis2_pcoa_wu2 <- lm(Axis.2 ~ Site + I(Month^2) ,data=pcoa_data_WWTP_wu) # assumptions hist(resid(lm_axis2_pcoa_wu)) plot(resid(lm_axis2_pcoa_wu), fitted(lm_axis2_pcoa_wu)) anova(lm_axis2_pcoa_wu) # posthoc Contrasts between Sites library(multcomp) summary(glht(lm_axis1_pcoa_wu, mcp(Site = "Tukey")), adjusted(type="bonferroni")) summary(glht(lm_axis2_pcoa_wu, mcp(Site = "Tukey")), adjusted(type="bonferroni")) # plot Axis2 ~ Month (is significant, see anova) ggplot(pcoa_data_WWTP_wu, aes(x=Month, y= Axis.2, color = Site)) + geom_point() pdf("Betadiversity_weightedUniFrac_Sites_PCoA.pdf", width = 10, height = 5, useDingbats = F) axis.1_pcoa_wu <- ggplot(pcoa_data_WWTP_wu, aes(x=Site, y= Axis.1, color = Site)) + geom_point(aes(), position = position_jitterdodge(0.5), alpha = 0.3) + geom_boxplot(aes(), position = position_dodge(0.5), alpha = 0.7) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) axis.2_pcoa_wu <- ggplot(pcoa_data_WWTP_wu, aes(x=Site, y= Axis.2, color = Site)) + geom_point(aes(), position = position_jitterdodge(0.5), alpha = 0.3) + geom_boxplot(aes(), position = position_dodge(0.5), alpha = 0.7) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) gridExtra::grid.arrange(axis.1_pcoa_wu, axis.2_pcoa_wu, ncol =2) dev.off() # plot PCoA ordinations pdf("Bray_WWTP_pcoa.pdf", useDingbats = F, height = 4, width = 6) beta_uu_WWTP_pcoa <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_PCoA_bray, axes = c(1,2), color = "Site") beta_uu_WWTP_pcoa + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)])+ ggtitle("PCoA Bray curtis - no surface water") dev.off() pdf("UnwUniFrac_WWTP_pcoa.pdf", useDingbats = F, height = 4, width = 6) beta_uu_WWTP_pcoa <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_PCoA_wu, axes = c(1,2), color = "Site") beta_uu_WWTP_pcoa + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)])+ ggtitle("PCoA PCoA unweighted UniFrac- no surface water") dev.off() pdf("WeiUniFrac_WWTP_pcoa.pdf", useDingbats = F, height = 4, width = 6) beta_wu_WWTP_pcoa <- plot_ordination(psdata_IVER_WWTP_rel, ord_WWTP_wu,axes = c(1,2), type="biplot" ,color = "Phylum") beta_wu_WWTP_pcoa + scale_color_manual(values = c(colorset, "black", "grey40", "grey80")) + ggtitle("PCoA weighted UniFrac - no surface water") dev.off() ################################ R -- Differential abundance analysis of 13 focal taxa ################################ ## Rarefy data to minimum sampling depth = 18059 reads per sample # input data set.seed(711) psdata_IVER_cleaned_noNA_18059 <- rarefy_even_depth(psdata_IVER_cleaned_noNA, min(sample_sums(psdata_IVER_cleaned_noNA)), rngseed = 711) psdata_IVER_cleaned_noNA_18059 ## 20191128 ## PVEE library(phyloseq) library(ggplot2) # taxon specific data frames #1 Escherichia/Shigella Escherichia_Shigella_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Esherichica/Shigella") Escherichia_Shigella_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Esherichica/Shigella") #2 Klebsiella Klebsiella_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Klebsiella") Klebsiella_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Klebsiella") #3 Acinetobacter Acinetobacter_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Acinetobacter") Acinetobacter_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Acinetobacter") #4 Mycobacterium Mycobacterium_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Mycobacterium") Mycobacterium_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Mycobacterium") #5 Enterobacter Enterobacter_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Enterobacter") Enterobacter_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Enterobacter") #6 Enterococcus Enterococcus_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Enterococcus") Enterococcus_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Enterococcus") #7 Bacteroides Bacteroides_ps<- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Bacteroides") Bacteroides_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Bacteroides") #8 Pseudomonas Pseudomonas_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Pseudomonas") Pseudomonas_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Pseudomonas") #9 Streptococcus Streptococcus_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Streptococcus") Streptococcus_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Streptococcus") #10 Clostridium Clostridium_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Clostridium_sensu_stricto_1") Clostridium_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Clostridium_sensu_stricto_1") #11 Aeromonas Aeromonas_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Aeromonas") Aeromonas_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Aeromonas") #12 Legionella Legionella_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Legionella") Legionella_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Legionella") #13 Leptospira Leptospira_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Leptospira") Leptospira_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Leptospira") ## adjust data frame parameters #1 sample_data(Escherichia_Shigella_ps)$Site <- factor(sample_data(Escherichia_Shigella_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Escherichia_Shigella_ps_rare)$Site <- factor(sample_data(Escherichia_Shigella_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #2 sample_data(Klebsiella_ps)$Site <- factor(sample_data(Klebsiella_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Klebsiella_ps_rare)$Site <- factor(sample_data(Klebsiella_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #3 sample_data(Acinetobacter_ps)$Site <- factor(sample_data(Acinetobacter_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Acinetobacter_ps_rare)$Site <- factor(sample_data(Acinetobacter_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #4 sample_data(Mycobacterium_ps)$Site <- factor(sample_data(Mycobacterium_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Mycobacterium_ps_rare)$Site <- factor(sample_data(Mycobacterium_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #5 sample_data(Enterobacter_ps)$Site <- factor(sample_data(Enterobacter_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Enterobacter_ps_rare)$Site <- factor(sample_data(Enterobacter_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #6 sample_data(Enterococcus_ps)$Site <- factor(sample_data(Enterococcus_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Enterococcus_ps_rare)$Site <- factor(sample_data(Enterococcus_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #7 sample_data(Bacteroides_ps)$Site <- factor(sample_data(Bacteroides_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Bacteroides_ps_rare)$Site <- factor(sample_data(Bacteroides_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #8 sample_data(Pseudomonas_ps)$Site <- factor(sample_data(Pseudomonas_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Pseudomonas_ps_rare)$Site <- factor(sample_data(Pseudomonas_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #9 sample_data(Streptococcus_ps)$Site <- factor(sample_data(Streptococcus_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Streptococcus_ps_rare)$Site <- factor(sample_data(Streptococcus_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #10 sample_data(Clostridium_ps)$Site <- factor(sample_data(Clostridium_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Clostridium_ps_rare)$Site <- factor(sample_data(Clostridium_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #11 sample_data(Aeromonas_ps)$Site <- factor(sample_data(Aeromonas_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Aeromonas_ps_rare)$Site <- factor(sample_data(Aeromonas_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #12 sample_data(Legionella_ps)$Site <- factor(sample_data(Legionella_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Legionella_ps_rare)$Site <- factor(sample_data(Legionella_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #13 sample_data(Leptospira_ps)$Site <- factor(sample_data(Leptospira_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Leptospira_ps_rare)$Site <- factor(sample_data(Leptospira_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #1 Escherichia_Shigella ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Escherichia_Shigella_ps)$samplesums <- sample_sums(Escherichia_Shigella_ps) df_Escherichia_Shigella <- as.data.frame(sample_data(Escherichia_Shigella_ps)@.Data) colnames(df_Escherichia_Shigella) <- sample_data(Escherichia_Shigella_ps)@names head(df_Escherichia_Shigella) df_Escherichia_Shigella$log_samplesums <- log(df_Escherichia_Shigella$samplesums) df_Escherichia_Shigella$log_samplesums_add1 <- log(df_Escherichia_Shigella$samplesums +1) # same on rarefied data sample_data(Escherichia_Shigella_ps_rare)$samplesums <- sample_sums(Escherichia_Shigella_ps_rare) df_Escherichia_Shigella_rare <- as.data.frame(sample_data(Escherichia_Shigella_ps_rare)@.Data) colnames(df_Escherichia_Shigella_rare) <- sample_data(Escherichia_Shigella_ps_rare)@names head(df_Escherichia_Shigella_rare) df_Escherichia_Shigella_rare$log_samplesums <- log(df_Escherichia_Shigella_rare$samplesums) df_Escherichia_Shigella_rare$log_samplesums_add1 <- log(df_Escherichia_Shigella_rare$samplesums +1) # add total coverage to Escherichia_Shigella df Escherichia_Shigella_samples <- c(as.character(df_Escherichia_Shigella$Sample)) ss_psdata_Escherichia_Shigella <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Escherichia_Shigella_samples) df_Escherichia_Shigella$samplesums_total <- sample_sums(ss_psdata_Escherichia_Shigella) df_Escherichia_Shigella$ratio_ss <- df_Escherichia_Shigella$samplesums/df_Escherichia_Shigella$samplesums_total df_Escherichia_Shigella$log_ratio <- log(df_Escherichia_Shigella$samplesums + 1) - log(df_Escherichia_Shigella$samplesums_total + 1) #plot ggplot(df_Escherichia_Shigella, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Escherichia_Shigella_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Escherichia_Shigella$date <- paste0(df_Escherichia_Shigella$Year, "-",df_Escherichia_Shigella$Month,"-",df_Escherichia_Shigella$Day) df_Escherichia_Shigella$date <- as.Date(df_Escherichia_Shigella$date, format = "%Y-%m-%d") df_Escherichia_Shigella_rare$date <- paste0(df_Escherichia_Shigella_rare$Year, "-",df_Escherichia_Shigella_rare$Month,"-",df_Escherichia_Shigella_rare$Day) df_Escherichia_Shigella_rare$date <- as.Date(df_Escherichia_Shigella_rare$date, format = "%Y-%m-%d") # linear regression lm_Escherichia_Shigella <- lm(data=df_Escherichia_Shigella, log_ratio ~ date + Site) anova(lm_Escherichia_Shigella) summary(lm_Escherichia_Shigella) # model assumptions hist(resid(lm_Escherichia_Shigella)) plot(resid(lm_Escherichia_Shigella) ~ fitted(lm_Escherichia_Shigella)) library(PMCMR) attach(df_Escherichia_Shigella) ph_kwtest_Escherichia_Shigella <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Escherichia_Shigella, file = "Stats_Escherichia_Shigella_logRatio_Site.txt") detach(df_Escherichia_Shigella) # Escherichia_Shigella rarefied data # linear regression lm_Escherichia_Shigella_rare <- lm(data=df_Escherichia_Shigella_rare, log_samplesums_add1 ~ date + Site) anova(lm_Escherichia_Shigella_rare) summary(lm_Escherichia_Shigella_rare) # model assumptions hist(resid(lm_Escherichia_Shigella_rare)) plot(resid(lm_Escherichia_Shigella_rare) ~ fitted(lm_Escherichia_Shigella_rare)) library(PMCMR) attach(df_Escherichia_Shigella_rare) ph_kwtest_Escherichia_Shigella_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Escherichia_Shigella_rare, file = "Stats_Escherichia_Shigella_logCount_rare18059_Site.txt") detach(df_Escherichia_Shigella_rare) kruskal_Escherichia_Shigella_rare <- kruskal.test(df_Escherichia_Shigella_rare$log_samplesums_add1, df_Escherichia_Shigella$Site) capture.output(kruskal_Escherichia_Shigella_rare, file = "kruskal_Escherichia_Shigella_rare.txt") #2 Klebsiella ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Klebsiella_ps)$samplesums <- sample_sums(Klebsiella_ps) df_Klebsiella <- as.data.frame(sample_data(Klebsiella_ps)@.Data) colnames(df_Klebsiella) <- sample_data(Klebsiella_ps)@names head(df_Klebsiella) df_Klebsiella$log_samplesums <- log(df_Klebsiella$samplesums) df_Klebsiella$log_samplesums_add1 <- log(df_Klebsiella$samplesums +1) # same on rarefied data sample_data(Klebsiella_ps_rare)$samplesums <- sample_sums(Klebsiella_ps_rare) df_Klebsiella_rare <- as.data.frame(sample_data(Klebsiella_ps_rare)@.Data) colnames(df_Klebsiella_rare) <- sample_data(Klebsiella_ps_rare)@names head(df_Klebsiella_rare) df_Klebsiella_rare$log_samplesums <- log(df_Klebsiella_rare$samplesums) df_Klebsiella_rare$log_samplesums_add1 <- log(df_Klebsiella_rare$samplesums +1) # add total coverage to Klebsiella df Klebsiella_samples <- c(as.character(df_Klebsiella$Sample)) ss_psdata_Klebsiella <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Klebsiella_samples) df_Klebsiella$samplesums_total <- sample_sums(ss_psdata_Klebsiella) df_Klebsiella$ratio_ss <- df_Klebsiella$samplesums/df_Klebsiella$samplesums_total df_Klebsiella$log_ratio <- log(df_Klebsiella$samplesums + 1) - log(df_Klebsiella$samplesums_total + 1) #plot ggplot(df_Klebsiella, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Klebsiella_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Klebsiella$date <- paste0(df_Klebsiella$Year, "-",df_Klebsiella$Month,"-",df_Klebsiella$Day) df_Klebsiella$date <- as.Date(df_Klebsiella$date, format = "%Y-%m-%d") df_Klebsiella_rare$date <- paste0(df_Klebsiella_rare$Year, "-",df_Klebsiella_rare$Month,"-",df_Klebsiella_rare$Day) df_Klebsiella_rare$date <- as.Date(df_Klebsiella_rare$date, format = "%Y-%m-%d") # linear regression lm_Klebsiella <- lm(data=df_Klebsiella, log_ratio ~ date + Site) anova(lm_Klebsiella) summary(lm_Klebsiella) # model assumptions hist(resid(lm_Klebsiella)) plot(resid(lm_Klebsiella) ~ fitted(lm_Klebsiella)) library(PMCMR) attach(df_Klebsiella) ph_kwtest_Klebsiella <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Klebsiella, file = "Stats_Klebsiella_logRatio_Site.txt") detach(df_Klebsiella) # Klebsiella rarefied data # linear regression lm_Klebsiella_rare <- lm(data=df_Klebsiella_rare, log_samplesums_add1 ~ date + Site) anova(lm_Klebsiella_rare) summary(lm_Klebsiella_rare) # model assumptions hist(resid(lm_Klebsiella_rare)) plot(resid(lm_Klebsiella_rare) ~ fitted(lm_Klebsiella_rare)) library(PMCMR) attach(df_Klebsiella_rare) ph_kwtest_Klebsiella_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Klebsiella_rare, file = "Stats_Klebsiella_logCount_rare18059_Site.txt") detach(df_Klebsiella_rare) kruskal_Klebsiella_rare <- kruskal.test(df_Klebsiella_rare$log_samplesums_add1, df_Klebsiella$Site) capture.output(kruskal_Klebsiella_rare, file = "kruskal_Klebsiella_rare.txt") #3 Acinetobacter ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Acinetobacter_ps)$samplesums <- sample_sums(Acinetobacter_ps) df_Acinetobacter <- as.data.frame(sample_data(Acinetobacter_ps)@.Data) colnames(df_Acinetobacter) <- sample_data(Acinetobacter_ps)@names head(df_Acinetobacter) df_Acinetobacter$log_samplesums <- log(df_Acinetobacter$samplesums) df_Acinetobacter$log_samplesums_add1 <- log(df_Acinetobacter$samplesums +1) # same on rarefied data sample_data(Acinetobacter_ps_rare)$samplesums <- sample_sums(Acinetobacter_ps_rare) df_Acinetobacter_rare <- as.data.frame(sample_data(Acinetobacter_ps_rare)@.Data) colnames(df_Acinetobacter_rare) <- sample_data(Acinetobacter_ps_rare)@names head(df_Acinetobacter_rare) df_Acinetobacter_rare$log_samplesums <- log(df_Acinetobacter_rare$samplesums) df_Acinetobacter_rare$log_samplesums_add1 <- log(df_Acinetobacter_rare$samplesums +1) # add total coverage to Acinetobacter df Acinetobacter_samples <- c(as.character(df_Acinetobacter$Sample)) ss_psdata_Acinetobacter <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Acinetobacter_samples) df_Acinetobacter$samplesums_total <- sample_sums(ss_psdata_Acinetobacter) df_Acinetobacter$ratio_ss <- df_Acinetobacter$samplesums/df_Acinetobacter$samplesums_total df_Acinetobacter$log_ratio <- log(df_Acinetobacter$samplesums + 1) - log(df_Acinetobacter$samplesums_total + 1) #plot ggplot(df_Acinetobacter, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Acinetobacter_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Acinetobacter$date <- paste0(df_Acinetobacter$Year, "-",df_Acinetobacter$Month,"-",df_Acinetobacter$Day) df_Acinetobacter$date <- as.Date(df_Acinetobacter$date, format = "%Y-%m-%d") df_Acinetobacter_rare$date <- paste0(df_Acinetobacter_rare$Year, "-",df_Acinetobacter_rare$Month,"-",df_Acinetobacter_rare$Day) df_Acinetobacter_rare$date <- as.Date(df_Acinetobacter_rare$date, format = "%Y-%m-%d") # linear regression lm_Acinetobacter <- lm(data=df_Acinetobacter, log_ratio ~ date + Site) anova(lm_Acinetobacter) summary(lm_Acinetobacter) # model assumptions hist(resid(lm_Acinetobacter)) plot(resid(lm_Acinetobacter) ~ fitted(lm_Acinetobacter)) library(PMCMR) attach(df_Acinetobacter) ph_kwtest_Acinetobacter <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Acinetobacter, file = "Stats_Acinetobacter_logRatio_Site.txt") detach(df_Acinetobacter) # Acinetobacter rarefied data # linear regression lm_Acinetobacter_rare <- lm(data=df_Acinetobacter_rare, log_samplesums_add1 ~ date + Site) anova(lm_Acinetobacter_rare) summary(lm_Acinetobacter_rare) # model assumptions hist(resid(lm_Acinetobacter_rare)) plot(resid(lm_Acinetobacter_rare) ~ fitted(lm_Acinetobacter_rare)) library(PMCMR) attach(df_Acinetobacter_rare) ph_kwtest_Acinetobacter_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Acinetobacter_rare, file = "Stats_Acinetobacter_logCount_rare18059_Site.txt") detach(df_Acinetobacter_rare) kruskal_Acinetobacter_rare <- kruskal.test(df_Acinetobacter_rare$log_samplesums_add1, df_Acinetobacter$Site) capture.output(kruskal_Acinetobacter_rare, file = "kruskal_Acinetobacter_rare.txt") #4 Mycobacterium ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Mycobacterium_ps)$samplesums <- sample_sums(Mycobacterium_ps) df_Mycobacterium <- as.data.frame(sample_data(Mycobacterium_ps)@.Data) colnames(df_Mycobacterium) <- sample_data(Mycobacterium_ps)@names head(df_Mycobacterium) df_Mycobacterium$log_samplesums <- log(df_Mycobacterium$samplesums) df_Mycobacterium$log_samplesums_add1 <- log(df_Mycobacterium$samplesums +1) # same on rarefied data sample_data(Mycobacterium_ps_rare)$samplesums <- sample_sums(Mycobacterium_ps_rare) df_Mycobacterium_rare <- as.data.frame(sample_data(Mycobacterium_ps_rare)@.Data) colnames(df_Mycobacterium_rare) <- sample_data(Mycobacterium_ps_rare)@names head(df_Mycobacterium_rare) df_Mycobacterium_rare$log_samplesums <- log(df_Mycobacterium_rare$samplesums) df_Mycobacterium_rare$log_samplesums_add1 <- log(df_Mycobacterium_rare$samplesums +1) # add total coverage to Mycobacterium df Mycobacterium_samples <- c(as.character(df_Mycobacterium$Sample)) ss_psdata_Mycobacterium <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Mycobacterium_samples) df_Mycobacterium$samplesums_total <- sample_sums(ss_psdata_Mycobacterium) df_Mycobacterium$ratio_ss <- df_Mycobacterium$samplesums/df_Mycobacterium$samplesums_total df_Mycobacterium$log_ratio <- log(df_Mycobacterium$samplesums + 1) - log(df_Mycobacterium$samplesums_total + 1) #plot ggplot(df_Mycobacterium, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Mycobacterium_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Mycobacterium$date <- paste0(df_Mycobacterium$Year, "-",df_Mycobacterium$Month,"-",df_Mycobacterium$Day) df_Mycobacterium$date <- as.Date(df_Mycobacterium$date, format = "%Y-%m-%d") df_Mycobacterium_rare$date <- paste0(df_Mycobacterium_rare$Year, "-",df_Mycobacterium_rare$Month,"-",df_Mycobacterium_rare$Day) df_Mycobacterium_rare$date <- as.Date(df_Mycobacterium_rare$date, format = "%Y-%m-%d") # linear regression lm_Mycobacterium <- lm(data=df_Mycobacterium, log_ratio ~ date + Site) anova(lm_Mycobacterium) summary(lm_Mycobacterium) # model assumptions hist(resid(lm_Mycobacterium)) plot(resid(lm_Mycobacterium) ~ fitted(lm_Mycobacterium)) library(PMCMR) attach(df_Mycobacterium) ph_kwtest_Mycobacterium <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Mycobacterium, file = "Stats_Mycobacterium_logRatio_Site.txt") detach(df_Mycobacterium) # Mycobacterium rarefied data # linear regression lm_Mycobacterium_rare <- lm(data=df_Mycobacterium_rare, log_samplesums_add1 ~ date + Site) anova(lm_Mycobacterium_rare) summary(lm_Mycobacterium_rare) # model assumptions hist(resid(lm_Mycobacterium_rare)) plot(resid(lm_Mycobacterium_rare) ~ fitted(lm_Mycobacterium_rare)) library(PMCMR) attach(df_Mycobacterium_rare) ph_kwtest_Mycobacterium_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Mycobacterium_rare, file = "Stats_Mycobacterium_logCount_rare18059_Site.txt") detach(df_Mycobacterium_rare) kruskal_Mycobacterium_rare <- kruskal.test(df_Mycobacterium_rare$log_samplesums_add1, df_Mycobacterium$Site) capture.output(kruskal_Mycobacterium_rare, file = "kruskal_Mycobacterium_rare.txt") #5 Enterobacter ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Enterobacter_ps)$samplesums <- sample_sums(Enterobacter_ps) df_Enterobacter <- as.data.frame(sample_data(Enterobacter_ps)@.Data) colnames(df_Enterobacter) <- sample_data(Enterobacter_ps)@names head(df_Enterobacter) df_Enterobacter$log_samplesums <- log(df_Enterobacter$samplesums) df_Enterobacter$log_samplesums_add1 <- log(df_Enterobacter$samplesums +1) # same on rarefied data sample_data(Enterobacter_ps_rare)$samplesums <- sample_sums(Enterobacter_ps_rare) df_Enterobacter_rare <- as.data.frame(sample_data(Enterobacter_ps_rare)@.Data) colnames(df_Enterobacter_rare) <- sample_data(Enterobacter_ps_rare)@names head(df_Enterobacter_rare) df_Enterobacter_rare$log_samplesums <- log(df_Enterobacter_rare$samplesums) df_Enterobacter_rare$log_samplesums_add1 <- log(df_Enterobacter_rare$samplesums +1) # add total coverage to Enterobacter df Enterobacter_samples <- c(as.character(df_Enterobacter$Sample)) ss_psdata_Enterobacter <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Enterobacter_samples) df_Enterobacter$samplesums_total <- sample_sums(ss_psdata_Enterobacter) df_Enterobacter$ratio_ss <- df_Enterobacter$samplesums/df_Enterobacter$samplesums_total df_Enterobacter$log_ratio <- log(df_Enterobacter$samplesums + 1) - log(df_Enterobacter$samplesums_total + 1) #plot ggplot(df_Enterobacter, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Enterobacter_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Enterobacter$date <- paste0(df_Enterobacter$Year, "-",df_Enterobacter$Month,"-",df_Enterobacter$Day) df_Enterobacter$date <- as.Date(df_Enterobacter$date, format = "%Y-%m-%d") df_Enterobacter_rare$date <- paste0(df_Enterobacter_rare$Year, "-",df_Enterobacter_rare$Month,"-",df_Enterobacter_rare$Day) df_Enterobacter_rare$date <- as.Date(df_Enterobacter_rare$date, format = "%Y-%m-%d") # linear regression lm_Enterobacter <- lm(data=df_Enterobacter, log_ratio ~ date + Site) anova(lm_Enterobacter) summary(lm_Enterobacter) # model assumptions hist(resid(lm_Enterobacter)) plot(resid(lm_Enterobacter) ~ fitted(lm_Enterobacter)) library(PMCMR) attach(df_Enterobacter) ph_kwtest_Enterobacter <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Enterobacter, file = "Stats_Enterobacter_logRatio_Site.txt") detach(df_Enterobacter) # Enterobacter rarefied data # linear regression lm_Enterobacter_rare <- lm(data=df_Enterobacter_rare, log_samplesums_add1 ~ date + Site) anova(lm_Enterobacter_rare) summary(lm_Enterobacter_rare) # model assumptions hist(resid(lm_Enterobacter_rare)) plot(resid(lm_Enterobacter_rare) ~ fitted(lm_Enterobacter_rare)) library(PMCMR) attach(df_Enterobacter_rare) ph_kwtest_Enterobacter_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Enterobacter_rare, file = "Stats_Enterobacter_logCount_rare18059_Site.txt") detach(df_Enterobacter_rare) kruskal_Enterobacter_rare <- kruskal.test(df_Enterobacter_rare$log_samplesums_add1, df_Enterobacter$Site) capture.output(kruskal_Enterobacter_rare, file = "kruskal_Enterobacter_rare.txt") #6 Enterococcus ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Enterococcus_ps)$samplesums <- sample_sums(Enterococcus_ps) df_Enterococcus <- as.data.frame(sample_data(Enterococcus_ps)@.Data) colnames(df_Enterococcus) <- sample_data(Enterococcus_ps)@names head(df_Enterococcus) df_Enterococcus$log_samplesums <- log(df_Enterococcus$samplesums) df_Enterococcus$log_samplesums_add1 <- log(df_Enterococcus$samplesums +1) # same on rarefied data sample_data(Enterococcus_ps_rare)$samplesums <- sample_sums(Enterococcus_ps_rare) df_Enterococcus_rare <- as.data.frame(sample_data(Enterococcus_ps_rare)@.Data) colnames(df_Enterococcus_rare) <- sample_data(Enterococcus_ps_rare)@names head(df_Enterococcus_rare) df_Enterococcus_rare$log_samplesums <- log(df_Enterococcus_rare$samplesums) df_Enterococcus_rare$log_samplesums_add1 <- log(df_Enterococcus_rare$samplesums +1) # add total coverage to Enterococcus df Enterococcus_samples <- c(as.character(df_Enterococcus$Sample)) ss_psdata_Enterococcus <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Enterococcus_samples) df_Enterococcus$samplesums_total <- sample_sums(ss_psdata_Enterococcus) df_Enterococcus$ratio_ss <- df_Enterococcus$samplesums/df_Enterococcus$samplesums_total df_Enterococcus$log_ratio <- log(df_Enterococcus$samplesums + 1) - log(df_Enterococcus$samplesums_total + 1) #plot ggplot(df_Enterococcus, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Enterococcus_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Enterococcus$date <- paste0(df_Enterococcus$Year, "-",df_Enterococcus$Month,"-",df_Enterococcus$Day) df_Enterococcus$date <- as.Date(df_Enterococcus$date, format = "%Y-%m-%d") df_Enterococcus_rare$date <- paste0(df_Enterococcus_rare$Year, "-",df_Enterococcus_rare$Month,"-",df_Enterococcus_rare$Day) df_Enterococcus_rare$date <- as.Date(df_Enterococcus_rare$date, format = "%Y-%m-%d") # linear regression lm_Enterococcus <- lm(data=df_Enterococcus, log_ratio ~ date + Site) anova(lm_Enterococcus) summary(lm_Enterococcus) # model assumptions hist(resid(lm_Enterococcus)) plot(resid(lm_Enterococcus) ~ fitted(lm_Enterococcus)) library(PMCMR) attach(df_Enterococcus) ph_kwtest_Enterococcus <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Enterococcus, file = "Stats_Enterococcus_logRatio_Site.txt") detach(df_Enterococcus) # Enterococcus rarefied data # linear regression lm_Enterococcus_rare <- lm(data=df_Enterococcus_rare, log_samplesums_add1 ~ date + Site) anova(lm_Enterococcus_rare) summary(lm_Enterococcus_rare) # model assumptions hist(resid(lm_Enterococcus_rare)) plot(resid(lm_Enterococcus_rare) ~ fitted(lm_Enterococcus_rare)) library(PMCMR) attach(df_Enterococcus_rare) ph_kwtest_Enterococcus_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Enterococcus_rare, file = "Stats_Enterococcus_logCount_rare18059_Site.txt") detach(df_Enterococcus_rare) kruskal_Enterococcus_rare <- kruskal.test(df_Enterococcus_rare$log_samplesums_add1, df_Enterococcus$Site) capture.output(kruskal_Enterococcus_rare, file = "kruskal_Enterococcus_rare.txt") #7 Bacteroides ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Bacteroides_ps)$samplesums <- sample_sums(Bacteroides_ps) df_Bacteroides <- as.data.frame(sample_data(Bacteroides_ps)@.Data) colnames(df_Bacteroides) <- sample_data(Bacteroides_ps)@names head(df_Bacteroides) df_Bacteroides$log_samplesums <- log(df_Bacteroides$samplesums) df_Bacteroides$log_samplesums_add1 <- log(df_Bacteroides$samplesums +1) # same on rarefied data sample_data(Bacteroides_ps_rare)$samplesums <- sample_sums(Bacteroides_ps_rare) df_Bacteroides_rare <- as.data.frame(sample_data(Bacteroides_ps_rare)@.Data) colnames(df_Bacteroides_rare) <- sample_data(Bacteroides_ps_rare)@names head(df_Bacteroides_rare) df_Bacteroides_rare$log_samplesums <- log(df_Bacteroides_rare$samplesums) df_Bacteroides_rare$log_samplesums_add1 <- log(df_Bacteroides_rare$samplesums +1) # add total coverage to Bacteroides df Bacteroides_samples <- c(as.character(df_Bacteroides$Sample)) ss_psdata_Bacteroides <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Bacteroides_samples) df_Bacteroides$samplesums_total <- sample_sums(ss_psdata_Bacteroides) df_Bacteroides$ratio_ss <- df_Bacteroides$samplesums/df_Bacteroides$samplesums_total df_Bacteroides$log_ratio <- log(df_Bacteroides$samplesums + 1) - log(df_Bacteroides$samplesums_total + 1) #plot ggplot(df_Bacteroides, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Bacteroides_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Bacteroides$date <- paste0(df_Bacteroides$Year, "-",df_Bacteroides$Month,"-",df_Bacteroides$Day) df_Bacteroides$date <- as.Date(df_Bacteroides$date, format = "%Y-%m-%d") df_Bacteroides_rare$date <- paste0(df_Bacteroides_rare$Year, "-",df_Bacteroides_rare$Month,"-",df_Bacteroides_rare$Day) df_Bacteroides_rare$date <- as.Date(df_Bacteroides_rare$date, format = "%Y-%m-%d") # linear regression lm_Bacteroides <- lm(data=df_Bacteroides, log_ratio ~ date + Site) anova(lm_Bacteroides) summary(lm_Bacteroides) # model assumptions hist(resid(lm_Bacteroides)) plot(resid(lm_Bacteroides) ~ fitted(lm_Bacteroides)) library(PMCMR) attach(df_Bacteroides) ph_kwtest_Bacteroides <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Bacteroides, file = "Stats_Bacteroides_logRatio_Site.txt") detach(df_Bacteroides) # Bacteroides rarefied data # linear regression lm_Bacteroides_rare <- lm(data=df_Bacteroides_rare, log_samplesums_add1 ~ date + Site) anova(lm_Bacteroides_rare) summary(lm_Bacteroides_rare) # model assumptions hist(resid(lm_Bacteroides_rare)) plot(resid(lm_Bacteroides_rare) ~ fitted(lm_Bacteroides_rare)) library(PMCMR) attach(df_Bacteroides_rare) ph_kwtest_Bacteroides_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Bacteroides_rare, file = "Stats_Bacteroides_logCount_rare18059_Site.txt") detach(df_Bacteroides_rare) kruskal_Bacteroides_rare <- kruskal.test(df_Bacteroides_rare$log_samplesums_add1, df_Bacteroides$Site) capture.output(kruskal_Bacteroides_rare, file = "kruskal_Bacteroides_rare.txt") #8 Pseudomonas ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Pseudomonas_ps)$samplesums <- sample_sums(Pseudomonas_ps) df_Pseudomonas <- as.data.frame(sample_data(Pseudomonas_ps)@.Data) colnames(df_Pseudomonas) <- sample_data(Pseudomonas_ps)@names head(df_Pseudomonas) df_Pseudomonas$log_samplesums <- log(df_Pseudomonas$samplesums) df_Pseudomonas$log_samplesums_add1 <- log(df_Pseudomonas$samplesums +1) # same on rarefied data sample_data(Pseudomonas_ps_rare)$samplesums <- sample_sums(Pseudomonas_ps_rare) df_Pseudomonas_rare <- as.data.frame(sample_data(Pseudomonas_ps_rare)@.Data) colnames(df_Pseudomonas_rare) <- sample_data(Pseudomonas_ps_rare)@names head(df_Pseudomonas_rare) df_Pseudomonas_rare$log_samplesums <- log(df_Pseudomonas_rare$samplesums) df_Pseudomonas_rare$log_samplesums_add1 <- log(df_Pseudomonas_rare$samplesums +1) # add total coverage to Pseudomonas df Pseudomonas_samples <- c(as.character(df_Pseudomonas$Sample)) ss_psdata_Pseudomonas <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Pseudomonas_samples) df_Pseudomonas$samplesums_total <- sample_sums(ss_psdata_Pseudomonas) df_Pseudomonas$ratio_ss <- df_Pseudomonas$samplesums/df_Pseudomonas$samplesums_total df_Pseudomonas$log_ratio <- log(df_Pseudomonas$samplesums + 1) - log(df_Pseudomonas$samplesums_total + 1) #plot ggplot(df_Pseudomonas, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Pseudomonas_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Pseudomonas$date <- paste0(df_Pseudomonas$Year, "-",df_Pseudomonas$Month,"-",df_Pseudomonas$Day) df_Pseudomonas$date <- as.Date(df_Pseudomonas$date, format = "%Y-%m-%d") df_Pseudomonas_rare$date <- paste0(df_Pseudomonas_rare$Year, "-",df_Pseudomonas_rare$Month,"-",df_Pseudomonas_rare$Day) df_Pseudomonas_rare$date <- as.Date(df_Pseudomonas_rare$date, format = "%Y-%m-%d") # linear regression lm_Pseudomonas <- lm(data=df_Pseudomonas, log_ratio ~ date + Site) anova(lm_Pseudomonas) summary(lm_Pseudomonas) # model assumptions hist(resid(lm_Pseudomonas)) plot(resid(lm_Pseudomonas) ~ fitted(lm_Pseudomonas)) library(PMCMR) attach(df_Pseudomonas) ph_kwtest_Pseudomonas <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Pseudomonas, file = "Stats_Pseudomonas_logRatio_Site.txt") detach(df_Pseudomonas) # Pseudomonas rarefied data # linear regression lm_Pseudomonas_rare <- lm(data=df_Pseudomonas_rare, log_samplesums_add1 ~ date + Site) anova(lm_Pseudomonas_rare) summary(lm_Pseudomonas_rare) # model assumptions hist(resid(lm_Pseudomonas_rare)) plot(resid(lm_Pseudomonas_rare) ~ fitted(lm_Pseudomonas_rare)) library(PMCMR) attach(df_Pseudomonas_rare) ph_kwtest_Pseudomonas_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Pseudomonas_rare, file = "Stats_Pseudomonas_logCount_rare18059_Site.txt") detach(df_Pseudomonas_rare) kruskal_Pseudomonas_rare <- kruskal.test(df_Pseudomonas_rare$log_samplesums_add1, df_Pseudomonas$Site) capture.output(kruskal_Pseudomonas_rare, file = "kruskal_Pseudomonas_rare.txt") #9 Streptococcus ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Streptococcus_ps)$samplesums <- sample_sums(Streptococcus_ps) df_Streptococcus <- as.data.frame(sample_data(Streptococcus_ps)@.Data) colnames(df_Streptococcus) <- sample_data(Streptococcus_ps)@names head(df_Streptococcus) df_Streptococcus$log_samplesums <- log(df_Streptococcus$samplesums) df_Streptococcus$log_samplesums_add1 <- log(df_Streptococcus$samplesums +1) # same on rarefied data sample_data(Streptococcus_ps_rare)$samplesums <- sample_sums(Streptococcus_ps_rare) df_Streptococcus_rare <- as.data.frame(sample_data(Streptococcus_ps_rare)@.Data) colnames(df_Streptococcus_rare) <- sample_data(Streptococcus_ps_rare)@names head(df_Streptococcus_rare) df_Streptococcus_rare$log_samplesums <- log(df_Streptococcus_rare$samplesums) df_Streptococcus_rare$log_samplesums_add1 <- log(df_Streptococcus_rare$samplesums +1) # add total coverage to Streptococcus df Streptococcus_samples <- c(as.character(df_Streptococcus$Sample)) ss_psdata_Streptococcus <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Streptococcus_samples) df_Streptococcus$samplesums_total <- sample_sums(ss_psdata_Streptococcus) df_Streptococcus$ratio_ss <- df_Streptococcus$samplesums/df_Streptococcus$samplesums_total df_Streptococcus$log_ratio <- log(df_Streptococcus$samplesums + 1) - log(df_Streptococcus$samplesums_total + 1) #plot ggplot(df_Streptococcus, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Streptococcus_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Streptococcus$date <- paste0(df_Streptococcus$Year, "-",df_Streptococcus$Month,"-",df_Streptococcus$Day) df_Streptococcus$date <- as.Date(df_Streptococcus$date, format = "%Y-%m-%d") df_Streptococcus_rare$date <- paste0(df_Streptococcus_rare$Year, "-",df_Streptococcus_rare$Month,"-",df_Streptococcus_rare$Day) df_Streptococcus_rare$date <- as.Date(df_Streptococcus_rare$date, format = "%Y-%m-%d") # linear regression lm_Streptococcus <- lm(data=df_Streptococcus, log_ratio ~ date + Site) anova(lm_Streptococcus) summary(lm_Streptococcus) # model assumptions hist(resid(lm_Streptococcus)) plot(resid(lm_Streptococcus) ~ fitted(lm_Streptococcus)) library(PMCMR) attach(df_Streptococcus) ph_kwtest_Streptococcus <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Streptococcus, file = "Stats_Streptococcus_logRatio_Site.txt") detach(df_Streptococcus) # Streptococcus rarefied data # linear regression lm_Streptococcus_rare <- lm(data=df_Streptococcus_rare, log_samplesums_add1 ~ date + Site) anova(lm_Streptococcus_rare) summary(lm_Streptococcus_rare) # model assumptions hist(resid(lm_Streptococcus_rare)) plot(resid(lm_Streptococcus_rare) ~ fitted(lm_Streptococcus_rare)) library(PMCMR) attach(df_Streptococcus_rare) ph_kwtest_Streptococcus_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Streptococcus_rare, file = "Stats_Streptococcus_logCount_rare18059_Site.txt") detach(df_Streptococcus_rare) kruskal_Streptococcus_rare <- kruskal.test(df_Streptococcus_rare$log_samplesums_add1, df_Streptococcus$Site) capture.output(kruskal_Streptococcus_rare, file = "kruskal_Streptococcus_rare.txt") #10 Clostridium ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Clostridium_ps)$samplesums <- sample_sums(Clostridium_ps) df_Clostridium <- as.data.frame(sample_data(Clostridium_ps)@.Data) colnames(df_Clostridium) <- sample_data(Clostridium_ps)@names head(df_Clostridium) df_Clostridium$log_samplesums <- log(df_Clostridium$samplesums) df_Clostridium$log_samplesums_add1 <- log(df_Clostridium$samplesums +1) # same on rarefied data sample_data(Clostridium_ps_rare)$samplesums <- sample_sums(Clostridium_ps_rare) df_Clostridium_rare <- as.data.frame(sample_data(Clostridium_ps_rare)@.Data) colnames(df_Clostridium_rare) <- sample_data(Clostridium_ps_rare)@names head(df_Clostridium_rare) df_Clostridium_rare$log_samplesums <- log(df_Clostridium_rare$samplesums) df_Clostridium_rare$log_samplesums_add1 <- log(df_Clostridium_rare$samplesums +1) # add total coverage to Clostridium df Clostridium_samples <- c(as.character(df_Clostridium$Sample)) ss_psdata_Clostridium <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Clostridium_samples) df_Clostridium$samplesums_total <- sample_sums(ss_psdata_Clostridium) df_Clostridium$ratio_ss <- df_Clostridium$samplesums/df_Clostridium$samplesums_total df_Clostridium$log_ratio <- log(df_Clostridium$samplesums + 1) - log(df_Clostridium$samplesums_total + 1) #plot ggplot(df_Clostridium, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Clostridium_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Clostridium$date <- paste0(df_Clostridium$Year, "-",df_Clostridium$Month,"-",df_Clostridium$Day) df_Clostridium$date <- as.Date(df_Clostridium$date, format = "%Y-%m-%d") df_Clostridium_rare$date <- paste0(df_Clostridium_rare$Year, "-",df_Clostridium_rare$Month,"-",df_Clostridium_rare$Day) df_Clostridium_rare$date <- as.Date(df_Clostridium_rare$date, format = "%Y-%m-%d") # linear regression lm_Clostridium <- lm(data=df_Clostridium, log_ratio ~ date + Site) anova(lm_Clostridium) summary(lm_Clostridium) # model assumptions hist(resid(lm_Clostridium)) plot(resid(lm_Clostridium) ~ fitted(lm_Clostridium)) library(PMCMR) attach(df_Clostridium) ph_kwtest_Clostridium <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Clostridium, file = "Stats_Clostridium_logRatio_Site.txt") detach(df_Clostridium) # Clostridium rarefied data # linear regression lm_Clostridium_rare <- lm(data=df_Clostridium_rare, log_samplesums_add1 ~ date + Site) anova(lm_Clostridium_rare) summary(lm_Clostridium_rare) # model assumptions hist(resid(lm_Clostridium_rare)) plot(resid(lm_Clostridium_rare) ~ fitted(lm_Clostridium_rare)) library(PMCMR) attach(df_Clostridium_rare) ph_kwtest_Clostridium_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Clostridium_rare, file = "Stats_Clostridium_logCount_rare18059_Site.txt") detach(df_Clostridium_rare) kruskal_Clostridium_rare <- kruskal.test(df_Clostridium_rare$log_samplesums_add1, df_Clostridium$Site) capture.output(kruskal_Clostridium_rare, file = "kruskal_Clostridium_rare.txt") #11 Aeromonas ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Aeromonas_ps)$samplesums <- sample_sums(Aeromonas_ps) df_Aeromonas <- as.data.frame(sample_data(Aeromonas_ps)@.Data) colnames(df_Aeromonas) <- sample_data(Aeromonas_ps)@names head(df_Aeromonas) df_Aeromonas$log_samplesums <- log(df_Aeromonas$samplesums) df_Aeromonas$log_samplesums_add1 <- log(df_Aeromonas$samplesums +1) # same on rarefied data sample_data(Aeromonas_ps_rare)$samplesums <- sample_sums(Aeromonas_ps_rare) df_Aeromonas_rare <- as.data.frame(sample_data(Aeromonas_ps_rare)@.Data) colnames(df_Aeromonas_rare) <- sample_data(Aeromonas_ps_rare)@names head(df_Aeromonas_rare) df_Aeromonas_rare$log_samplesums <- log(df_Aeromonas_rare$samplesums) df_Aeromonas_rare$log_samplesums_add1 <- log(df_Aeromonas_rare$samplesums +1) # add total coverage to Aeromonas df Aeromonas_samples <- c(as.character(df_Aeromonas$Sample)) ss_psdata_Aeromonas <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Aeromonas_samples) df_Aeromonas$samplesums_total <- sample_sums(ss_psdata_Aeromonas) df_Aeromonas$ratio_ss <- df_Aeromonas$samplesums/df_Aeromonas$samplesums_total df_Aeromonas$log_ratio <- log(df_Aeromonas$samplesums + 1) - log(df_Aeromonas$samplesums_total + 1) #plot ggplot(df_Aeromonas, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Aeromonas_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Aeromonas$date <- paste0(df_Aeromonas$Year, "-",df_Aeromonas$Month,"-",df_Aeromonas$Day) df_Aeromonas$date <- as.Date(df_Aeromonas$date, format = "%Y-%m-%d") df_Aeromonas_rare$date <- paste0(df_Aeromonas_rare$Year, "-",df_Aeromonas_rare$Month,"-",df_Aeromonas_rare$Day) df_Aeromonas_rare$date <- as.Date(df_Aeromonas_rare$date, format = "%Y-%m-%d") # linear regression lm_Aeromonas <- lm(data=df_Aeromonas, log_ratio ~ date + Site) anova(lm_Aeromonas) summary(lm_Aeromonas) # model assumptions hist(resid(lm_Aeromonas)) plot(resid(lm_Aeromonas) ~ fitted(lm_Aeromonas)) library(PMCMR) attach(df_Aeromonas) ph_kwtest_Aeromonas <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Aeromonas, file = "Stats_Aeromonas_logRatio_Site.txt") detach(df_Aeromonas) # Aeromonas rarefied data # linear regression lm_Aeromonas_rare <- lm(data=df_Aeromonas_rare, log_samplesums_add1 ~ date + Site) anova(lm_Aeromonas_rare) summary(lm_Aeromonas_rare) # model assumptions hist(resid(lm_Aeromonas_rare)) plot(resid(lm_Aeromonas_rare) ~ fitted(lm_Aeromonas_rare)) library(PMCMR) attach(df_Aeromonas_rare) ph_kwtest_Aeromonas_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Aeromonas_rare, file = "Stats_Aeromonas_logCount_rare18059_Site.txt") detach(df_Aeromonas_rare) kruskal_Aeromonas_rare <- kruskal.test(df_Aeromonas_rare$log_samplesums_add1, df_Aeromonas$Site) capture.output(kruskal_Aeromonas_rare, file = "kruskal_Aeromonas_rare.txt") #12 Legionella ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Legionella_ps)$samplesums <- sample_sums(Legionella_ps) df_Legionella <- as.data.frame(sample_data(Legionella_ps)@.Data) colnames(df_Legionella) <- sample_data(Legionella_ps)@names head(df_Legionella) df_Legionella$log_samplesums <- log(df_Legionella$samplesums) df_Legionella$log_samplesums_add1 <- log(df_Legionella$samplesums +1) # same on rarefied data sample_data(Legionella_ps_rare)$samplesums <- sample_sums(Legionella_ps_rare) df_Legionella_rare <- as.data.frame(sample_data(Legionella_ps_rare)@.Data) colnames(df_Legionella_rare) <- sample_data(Legionella_ps_rare)@names head(df_Legionella_rare) df_Legionella_rare$log_samplesums <- log(df_Legionella_rare$samplesums) df_Legionella_rare$log_samplesums_add1 <- log(df_Legionella_rare$samplesums +1) # add total coverage to Legionella df Legionella_samples <- c(as.character(df_Legionella$Sample)) ss_psdata_Legionella <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Legionella_samples) df_Legionella$samplesums_total <- sample_sums(ss_psdata_Legionella) df_Legionella$ratio_ss <- df_Legionella$samplesums/df_Legionella$samplesums_total df_Legionella$log_ratio <- log(df_Legionella$samplesums + 1) - log(df_Legionella$samplesums_total + 1) #plot ggplot(df_Legionella, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Legionella_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Legionella$date <- paste0(df_Legionella$Year, "-",df_Legionella$Month,"-",df_Legionella$Day) df_Legionella$date <- as.Date(df_Legionella$date, format = "%Y-%m-%d") df_Legionella_rare$date <- paste0(df_Legionella_rare$Year, "-",df_Legionella_rare$Month,"-",df_Legionella_rare$Day) df_Legionella_rare$date <- as.Date(df_Legionella_rare$date, format = "%Y-%m-%d") # linear regression lm_Legionella <- lm(data=df_Legionella, log_ratio ~ date + Site) anova(lm_Legionella) summary(lm_Legionella) # model assumptions hist(resid(lm_Legionella)) plot(resid(lm_Legionella) ~ fitted(lm_Legionella)) library(PMCMR) attach(df_Legionella) ph_kwtest_Legionella <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Legionella, file = "Stats_Legionella_logRatio_Site.txt") detach(df_Legionella) # Legionella rarefied data # linear regression lm_Legionella_rare <- lm(data=df_Legionella_rare, log_samplesums_add1 ~ date + Site) anova(lm_Legionella_rare) summary(lm_Legionella_rare) # model assumptions hist(resid(lm_Legionella_rare)) plot(resid(lm_Legionella_rare) ~ fitted(lm_Legionella_rare)) library(PMCMR) attach(df_Legionella_rare) ph_kwtest_Legionella_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Legionella_rare, file = "Stats_Legionella_logCount_rare18059_Site.txt") detach(df_Legionella_rare) kruskal_Legionella_rare <- kruskal.test(df_Legionella_rare$log_samplesums_add1, df_Legionella$Site) capture.output(kruskal_Legionella_rare, file = "kruskal_Legionella_rare.txt") #13 Leptospira ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Leptospira_ps)$samplesums <- sample_sums(Leptospira_ps) df_Leptospira <- as.data.frame(sample_data(Leptospira_ps)@.Data) colnames(df_Leptospira) <- sample_data(Leptospira_ps)@names head(df_Leptospira) df_Leptospira$log_samplesums <- log(df_Leptospira$samplesums) df_Leptospira$log_samplesums_add1 <- log(df_Leptospira$samplesums +1) # same on rarefied data sample_data(Leptospira_ps_rare)$samplesums <- sample_sums(Leptospira_ps_rare) df_Leptospira_rare <- as.data.frame(sample_data(Leptospira_ps_rare)@.Data) colnames(df_Leptospira_rare) <- sample_data(Leptospira_ps_rare)@names head(df_Leptospira_rare) df_Leptospira_rare$log_samplesums <- log(df_Leptospira_rare$samplesums) df_Leptospira_rare$log_samplesums_add1 <- log(df_Leptospira_rare$samplesums +1) # add total coverage to Leptospira df Leptospira_samples <- c(as.character(df_Leptospira$Sample)) ss_psdata_Leptospira <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Leptospira_samples) df_Leptospira$samplesums_total <- sample_sums(ss_psdata_Leptospira) df_Leptospira$ratio_ss <- df_Leptospira$samplesums/df_Leptospira$samplesums_total df_Leptospira$log_ratio <- log(df_Leptospira$samplesums + 1) - log(df_Leptospira$samplesums_total + 1) #plot ggplot(df_Leptospira, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Leptospira_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Leptospira$date <- paste0(df_Leptospira$Year, "-",df_Leptospira$Month,"-",df_Leptospira$Day) df_Leptospira$date <- as.Date(df_Leptospira$date, format = "%Y-%m-%d") df_Leptospira_rare$date <- paste0(df_Leptospira_rare$Year, "-",df_Leptospira_rare$Month,"-",df_Leptospira_rare$Day) df_Leptospira_rare$date <- as.Date(df_Leptospira_rare$date, format = "%Y-%m-%d") # linear regression lm_Leptospira <- lm(data=df_Leptospira, log_ratio ~ date + Site) anova(lm_Leptospira) summary(lm_Leptospira) # model assumptions hist(resid(lm_Leptospira)) plot(resid(lm_Leptospira) ~ fitted(lm_Leptospira)) library(PMCMR) attach(df_Leptospira) ph_kwtest_Leptospira <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Leptospira, file = "Stats_Leptospira_logRatio_Site.txt") detach(df_Leptospira) # Leptospira rarefied data # linear regression lm_Leptospira_rare <- lm(data=df_Leptospira_rare, log_samplesums_add1 ~ date + Site) anova(lm_Leptospira_rare) summary(lm_Leptospira_rare) # model assumptions hist(resid(lm_Leptospira_rare)) plot(resid(lm_Leptospira_rare) ~ fitted(lm_Leptospira_rare)) library(PMCMR) attach(df_Leptospira_rare) ph_kwtest_Leptospira_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Leptospira_rare, file = "Stats_Leptospira_logCount_rare18059_Site.txt") detach(df_Leptospira_rare) kruskal_Leptospira_rare <- kruskal.test(df_Leptospira_rare$log_samplesums_add1, df_Leptospira$Site) capture.output(kruskal_Leptospira_rare, file = "kruskal_Leptospira_rare.txt") ################################ R -- DESeq2 differential abundance analysis of ASVs ################################ ## calculate differential abundance of ASVs for H N C sites [ASV level!] # based on beta diversity patterns we aim to find the underlying # differences in bacterial taxa that drive beta diversity # input data psdata_IVER_WWTP <- subset_samples(psdata_IVER_cleaned_noNA, Site != "up" & Site != "down" & Site !="control") psdata_IVER_WWTP # subset all H N C samples HNC_psdata <- subset_samples(psdata_IVER_WWTP, Site == "H" | Site == "N" | Site == "C") # prune empty taxa slots HNC_psdata_pruned <- prune_taxa(taxa_sums(HNC_psdata) > 0, HNC_psdata) # create Deseq object from physeq object HNC_dds <- phyloseq_to_deseq2(HNC_psdata_pruned, ~ Site) # Esitmate size factors and dispersions HNC_dds <- estimateSizeFactors(HNC_dds) HNC_dds <- estimateDispersions(HNC_dds) # plot dispersions deseq_var_HCN <- deseq_var(HNC_psdata_pruned, "Site") plot_dispersion(deseq_var_HCN$datadf, deseq_var_HCN$fitdf) # apply VarianceStabilizingTransformation HNC_dds <- varianceStabilizingTransformation(HNC_dds) dim(HNC_dds) # store original non-transformed otu table in updown_prunedpruned_0 and the object to add VST_table in updown_prunedpruned_vst HNC_psdata_pruned_0 <- HNC_psdata_pruned HNC_psdata_pruned_vst <- HNC_psdata_pruned otu_table(HNC_psdata_pruned_0)[1:5,1:5] # store variance stabilised transformed otu table in soildata_vst otu_table(HNC_psdata_pruned_vst) <- otu_table(assay(HNC_dds), taxa_are_rows = TRUE) otu_table(HNC_psdata_pruned_vst)[1:5,1:5] # set negative transformed values to ZERO (as they are estimated to be close to zero based on the fitted NegBin model: the more negative, the more likely to be true 0) otu_table(HNC_psdata_pruned_vst)[otu_table(HNC_psdata_pruned_vst) < 0] <- 0 otu_table(HNC_psdata_pruned_vst)[1:5,1:5] HNC_dds_vst <- phyloseq_to_deseq2(HNC_psdata_pruned_vst, ~ Site) #geoMeans <- apply(counts(updown_dds_G), 1, gm_mean) HNC_dds_vst <- estimateSizeFactors(HNC_dds_vst) HNC_dds_vst <- estimateDispersions(HNC_dds_vst, fitType = "local") HNC_dds_vst ## Differential abundance testing GENUS library(DESeq2) design(HNC_dds_vst) HNC_dds_vst_DA <- DESeq(HNC_dds_vst) resultsNames(HNC_dds_vst_DA) res_HC <- results(HNC_dds_vst_DA, contrast = c("Site", "H", "C")) res_HN <- results(HNC_dds_vst_DA, contrast = c("Site", "H", "N")) res_NC <- results(HNC_dds_vst_DA, contrast = c("Site", "N", "C")) summary(res_HC) summary(res_HN) summary(res_NC) # remove NAs that appeared because of low Normalized counts (power too low for robust DA estimation) res_HC_nona <- res_HC[!is.na(res_HC$padj),] res_HN_nona <- res_HN[!is.na(res_HN$padj),] res_NC_nona <- res_NC[!is.na(res_NC$padj),] ### ASVs in higher abundance in H & N vs. C # Filter rownames(lfc positive values) %in% H & N --> intersect sigtab_HC_pos <- subset(sigtab_HC, log2FoldChange > 0) sigtab_NC_pos <- subset(sigtab_NC, log2FoldChange > 0) sigtab_HC_neg <- subset(sigtab_HC, log2FoldChange < 0) sigtab_NC_neg <- subset(sigtab_NC, log2FoldChange < 0) # more abunant taxa in clinical samples (HN vs. C) list_HN_vs_C <- intersect(rownames(sigtab_HC_pos), rownames(sigtab_NC_pos)) length(list_HN_vs_C) # 207 HN vs C # more abunant taxa in municipal samples (C vs. HN) list_HN_vs_C_neg <- intersect(rownames(sigtab_HC_neg), rownames(sigtab_NC_neg)) length(list_HN_vs_C_neg) # 128 C vs HN sigtab_HN_vs_C <- subset(sigtab_HC, rownames(sigtab_HC) %in% list_HN_vs_C) sigtab_HN_vs_C <- sigtab_HN_vs_C[order(rownames(sigtab_HN_vs_C)),] sigtab_N_vs_C <- subset(sigtab_NC, rownames(sigtab_NC) %in% list_HN_vs_C) sigtab_N_vs_C <- sigtab_N_vs_C[order(rownames(sigtab_N_vs_C)),] HNC_psdata_pruned_rel <- transform_sample_counts(HNC_psdata_pruned, function(x) x/sum(x)*100) HNC_psdata_pruned_rel <- subset_taxa(HNC_psdata_pruned_rel, taxa_names(HNC_psdata_pruned_rel) %in% list_HN_vs_C) HNC_psdata_pruned_rel_H <- subset_samples(HNC_psdata_pruned_rel, Site == "H") HNC_psdata_pruned_rel_N <- subset_samples(HNC_psdata_pruned_rel, Site == "N") HNC_psdata_pruned_rel_C <- subset_samples(HNC_psdata_pruned_rel, Site == "C") HNC_comp_data <- as.data.frame(rowMeans(otu_table(HNC_psdata_pruned_rel_H))) HNC_comp_data$mean_N <- c(rowMeans(otu_table(HNC_psdata_pruned_rel_N))) HNC_comp_data$mean_C <- c(rowMeans(otu_table(HNC_psdata_pruned_rel_C))) names(HNC_comp_data)[1] <- "mean_H" head(HNC_comp_data) HNC_comp_data <- HNC_comp_data[sort(rownames(HNC_comp_data)),] identical(rownames(HNC_comp_data), rownames(sigtab_HN_vs_C)) # ok HNC_comp_data_HClfc <- cbind(HNC_comp_data, sigtab_HN_vs_C[,c(2,6:13)]) dim(HNC_comp_data_HClfc) HNC_comp_data_HClfc <- HNC_comp_data_HClfc[,c(1:12)] colnames(HNC_comp_data_HClfc) identical(rownames(HNC_comp_data_HClfc), rownames(sigtab_N_vs_C)) #True HNC_comp_data_HClfc$log2fc_NC <- sigtab_N_vs_C$log2FoldChange names(HNC_comp_data_HClfc) # write summary table HNC_Genus_comp_data_HNvsC write.table(HNC_comp_data_HClfc, file = "Relative_abundances_ASVs_HCN_AND_Log2FoldChanges_HC_and_NC.txt", sep = "\t", dec = ".") library(ggplot2) # HN vs C pdf("Relabund_HN_vs_C_log2fc_alpha.pdf", width = 9, height = 6, useDingbats = F) ggplot(HNC_comp_data_HClfc, aes(x=mean_C+0.001, y=mean_H+0.001, color=Phylum, size = log2FoldChange)) + geom_point(aes(alpha = log2FoldChange)) + scale_y_log10() + scale_x_log10() + scale_size_continuous(trans = "exp") + geom_text(aes(label = ifelse(log2FoldChange > 4, as.character(paste0(Genus, " ", Species)),"")), hjust = -0.1, vjust = 0) dev.off() ###### On significantly enriched ASVs in HN vs C ---> now perfrom test between C and I. ####### ## calculate differential abundance of ASVs for H N C sites [ASV level!] # based on differential abundance of taxa between HN ('clinical') and C we aim to find the underlying # differences in 'clinical' bacterial taxa that differ between I and C # input data psdata_IVER_WWTP # check levels(sample_data(psdata_IVER_WWTP)$Site) # includes I and C # list of taxa significantly differentially abundant between HN and C list_HN_vs_C # subset samples for I and C psdata_IVER_CI<- subset_samples(psdata_IVER_WWTP, Site == "I" | Site == "C") # subset taxa typical for HN psdata_IVER_CI_clinical <- prune_taxa(taxa_names(psdata_IVER_CI) %in% list_HN_vs_C, psdata_IVER_CI) # check counts for each ASV > 0? --> 13 taxa typical for HN (vs C) are not found in IC psdata_IVER_CI_clinical_pruned <- prune_taxa(taxa_sums(psdata_IVER_CI_clinical) > 0, psdata_IVER_CI_clinical) ### DESeq2 ### # create Deseq object from physeq object CI_clinical_dds <- phyloseq_to_deseq2(psdata_IVER_CI_clinical_pruned, ~ Site) # Esitmate size factors and dispersions CI_clinical_dds <- estimateSizeFactors(CI_clinical_dds) CI_clinical_dds <- estimateDispersions(CI_clinical_dds) # plot dispersions deseq_var_CI_clinical <- deseq_var(psdata_IVER_CI_clinical_pruned, "Site") plot_dispersion(deseq_var_CI_clinical$datadf, deseq_var_CI_clinical$fitdf) # apply VarianceStabilizingTransformation CI_clinical_dds_vst <- varianceStabilizingTransformation(CI_clinical_dds) dim(CI_clinical_dds_vst) # store original non-transformed otu table in updown_prunedpruned_0 and the object to add VST_table in updown_prunedpruned_vst psdata_IVER_CI_clinical_pruned_0 <- psdata_IVER_CI_clinical_pruned psdata_IVER_CI_clinical_pruned_vst <- psdata_IVER_CI_clinical_pruned otu_table(psdata_IVER_CI_clinical_pruned_0)[1:5,1:5] # store variance stabilised transformed otu table in soildata_vst otu_table(psdata_IVER_CI_clinical_pruned_vst) <- otu_table(assay(CI_clinical_dds_vst), taxa_are_rows = TRUE) otu_table(psdata_IVER_CI_clinical_pruned_vst)[1:5,1:5] # set negative transformed values to ZERO (as they are estimated to be close to zero based on the fitted NegBin model: the more negative, the more likely to be true 0) otu_table(psdata_IVER_CI_clinical_pruned_vst)[otu_table(psdata_IVER_CI_clinical_pruned_vst) < 0] <- 0 otu_table(psdata_IVER_CI_clinical_pruned_vst)[1:5,1:5] CI_clinical_dds_vst <- phyloseq_to_deseq2(psdata_IVER_CI_clinical_pruned_vst, ~ Site) #geoMeans <- apply(counts(updown_dds_G), 1, gm_mean) CI_clinical_dds_vst <- estimateSizeFactors(CI_clinical_dds_vst) CI_clinical_dds_vst <- estimateDispersions(CI_clinical_dds_vst, fitType = "local") CI_clinical_dds_vst ## Differential abundance testing GENUS library(DESeq2) design(CI_clinical_dds_vst) CI_clinical_dds_vst_DA <- DESeq(CI_clinical_dds_vst) resultsNames(CI_clinical_dds_vst_DA) # Results DESEq2 res_IC <- results(CI_clinical_dds_vst_DA, contrast = c("Site", "I", "C")) summary(res_IC) # Filter results # remove NAs that appeared because of low Normalized counts (power too low for robust DA estimation) res_IC_nona <- res_IC[!is.na(res_IC$padj),] # order taxa by foldchange and p values alpha = 0.1 # HC comparison keepOTUs_IC <- res_IC_nona[res_IC_nona$padj < alpha, ] keepOTUs_IC_lfc_order <- keepOTUs_IC[order(keepOTUs_IC$log2FoldChange, na.last = NA), ] sigtab_IC <- as(keepOTUs_IC_lfc_order[which(keepOTUs_IC_lfc_order$padj < alpha),], "data.frame") sigtab_IC <- cbind(as.matrix(sigtab_IC), as.matrix(tax_table(psdata_IVER_CI_clinical_pruned_vst)[rownames(sigtab_IC),])) sigtab_IC <- data.frame(sigtab_IC) head(sigtab_IC) str(sigtab_IC) # change str of columns for(i in 1:6){sigtab_IC[,i] <- as.numeric(as.character(sigtab_IC[,i]))} for(i in 7:13){sigtab_IC[,i] <- as.factor(as.character(sigtab_IC[,i]))} mode(sigtab_IC) ### ASVs in higher abundance in I vs. C # Filter rownames(lfc positive values [higher in I] & lfc negative values [higher in C]) sigtab_IC_pos <- subset(sigtab_IC, log2FoldChange > 0) sigtab_IC_neg <- subset(sigtab_IC, log2FoldChange < 0) #sigtab_HN_vs_C <- subset(sigtab_HC, rownames(sigtab_HC) %in% list_HN_vs_C) #sigtab_HN_vs_C <- sigtab_HN_vs_C[order(rownames(sigtab_HN_vs_C)),] #sigtab_N_vs_C <- subset(sigtab_NC, rownames(sigtab_NC) %in% list_HN_vs_C) #sigtab_N_vs_C <- sigtab_N_vs_C[order(rownames(sigtab_N_vs_C)),] # calculate relative abundances for plotting psdata_IVER_CI_pruned <- prune_taxa(taxa_sums(psdata_IVER_CI) >0, psdata_IVER_CI) psdata_IVER_CI_pruned_rel <- transform_sample_counts(psdata_IVER_CI_pruned, function(x) x/sum(x)*100) psdata_IVER_CI_pruned_rel <- subset_taxa(psdata_IVER_CI_pruned_rel, taxa_names(psdata_IVER_CI_pruned_rel) %in% rownames(sigtab_IC)) psdata_IVER_CI_pruned_rel_I <- subset_samples(psdata_IVER_CI_pruned_rel, Site == "I") psdata_IVER_CI_pruned_rel_C <- subset_samples(psdata_IVER_CI_pruned_rel, Site == "C") # calculate mean values per sample type for each ASV CI_clinical_comp_data_IC <- as.data.frame(rowMeans(otu_table(psdata_IVER_CI_pruned_rel_I))) CI_clinical_comp_data_IC$mean_C <- c(rowMeans(otu_table(psdata_IVER_CI_pruned_rel_C))) names(CI_clinical_comp_data_IC)[1] <- "mean_I" # stratify rownames order between psdata and sigtab head(CI_clinical_comp_data_IC) CI_clinical_comp_data_IC <- CI_clinical_comp_data_IC[sort(rownames(CI_clinical_comp_data_IC)),] sigtab_IC <- sigtab_IC[sort(rownames(sigtab_IC)),] identical(rownames(CI_clinical_comp_data_IC), rownames(sigtab_IC)) # ok CI_clinical_comp_data_IClfc <- cbind(CI_clinical_comp_data_IC, sigtab_IC[,c(2,6:13)]) dim(CI_clinical_comp_data_IClfc) colnames(CI_clinical_comp_data_IClfc) # write summary table write.table(CI_clinical_comp_data_IClfc, file = "Relative_abundances_ASVs_IC_AND_Log2FoldChanges_IC.txt", sep = "\t", dec = ".") library(ggplot2) # I vs C for only HN typical 'clinical' ASVs (based on DESeq2) pdf("Relabund_ClinicalASVs_I_vs_C_log2fc_alpha.pdf", width = 9, height = 6, useDingbats = F) ggplot(CI_clinical_comp_data_IClfc, aes(x=mean_C+0.001, y=mean_I+0.001, color=Phylum)) + geom_point(aes(alpha = log2FoldChange, size = log2FoldChange)) + scale_y_log10() + scale_x_log10() + scale_size_continuous(trans = "exp") + geom_text(aes(label = as.character(paste0(Genus, " ", Species)), hjust = -0.1, vjust = 0)) + geom_abline(intercept = 0, slope = 1, color="grey50", size=0.5) dev.off() capture.output(CI_clinical_comp_data_IClfc, file = "Stats_CI_clinical_comp_data_IClfc.txt") ################################ R -- Differential abundance analysis of 8 focal taxa: Clinical enriched taxa in I ################################ ## Rarefy data to minimum sampling depth = 18059 reads per sample # input data set.seed(711) psdata_IVER_cleaned_noNA_18059 <- rarefy_even_depth(psdata_IVER_cleaned_noNA, min(sample_sums(psdata_IVER_cleaned_noNA)), rngseed = 711) psdata_IVER_cleaned_noNA_18059 ##### library(phyloseq) library(ggplot2) # taxon specific data frames #1 Arcobacter Arcobacter_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Arcobacter") Arcobacter_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Arcobacter") #2 Cloacibacterium Cloacibacterium_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Cloacibacterium") Cloacibacterium_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Cloacibacterium") #3 Desulfovibrio Desulfovibrio_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Desulfovibrio") Desulfovibrio_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Desulfovibrio") #4 Mycobacterium Lactobacillus_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Lactobacillus") Lactobacillus_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Lactobacillus") #5 Paludibacter Paludibacter_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Paludibacter") Paludibacter_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Paludibacter") #6 Prevotella Prevotella_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Prevotella") Prevotella_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Prevotella") #7 Selenomonas Selenomonas_ps<- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Selenomonas") Selenomonas_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Selenomonas") #8 Tolumonas Tolumonas_ps <- subset_taxa(psdata_IVER_cleaned_noNA, Genus == "Tolumonas") Tolumonas_ps_rare <- subset_taxa(psdata_IVER_cleaned_noNA_18059, Genus == "Tolumonas") ## adjust data frame parameters #1 sample_data(Arcobacter_ps)$Site <- factor(sample_data(Arcobacter_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Arcobacter_ps_rare)$Site <- factor(sample_data(Arcobacter_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #2 sample_data(Cloacibacterium_ps)$Site <- factor(sample_data(Cloacibacterium_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Cloacibacterium_ps_rare)$Site <- factor(sample_data(Cloacibacterium_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #3 sample_data(Desulfovibrio_ps)$Site <- factor(sample_data(Desulfovibrio_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Desulfovibrio_ps_rare)$Site <- factor(sample_data(Desulfovibrio_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #4 sample_data(Lactobacillus_ps)$Site <- factor(sample_data(Lactobacillus_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Lactobacillus_ps_rare)$Site <- factor(sample_data(Lactobacillus_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #5 sample_data(Paludibacter_ps)$Site <- factor(sample_data(Paludibacter_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Paludibacter_ps_rare)$Site <- factor(sample_data(Paludibacter_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #6 sample_data(Prevotella_ps)$Site <- factor(sample_data(Prevotella_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Prevotella_ps_rare)$Site <- factor(sample_data(Prevotella_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #7 sample_data(Selenomonas_ps)$Site <- factor(sample_data(Selenomonas_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Selenomonas_ps_rare)$Site <- factor(sample_data(Selenomonas_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #8 sample_data(Tolumonas_ps)$Site <- factor(sample_data(Tolumonas_ps)$Site, levels = c("H","N","C","I","E","up","down","control") ) sample_data(Tolumonas_ps_rare)$Site <- factor(sample_data(Tolumonas_ps_rare)$Site, levels = c("H","N","C","I","E","up","down","control") ) #1 Arcobacter ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Arcobacter_ps)$samplesums <- sample_sums(Arcobacter_ps) df_Arcobacter <- as.data.frame(sample_data(Arcobacter_ps)@.Data) colnames(df_Arcobacter) <- sample_data(Arcobacter_ps)@names head(df_Arcobacter) df_Arcobacter$log_samplesums <- log(df_Arcobacter$samplesums) df_Arcobacter$log_samplesums_add1 <- log(df_Arcobacter$samplesums +1) # same on rarefied data sample_data(Arcobacter_ps_rare)$samplesums <- sample_sums(Arcobacter_ps_rare) df_Arcobacter_rare <- as.data.frame(sample_data(Arcobacter_ps_rare)@.Data) colnames(df_Arcobacter_rare) <- sample_data(Arcobacter_ps_rare)@names head(df_Arcobacter_rare) df_Arcobacter_rare$log_samplesums <- log(df_Arcobacter_rare$samplesums) df_Arcobacter_rare$log_samplesums_add1 <- log(df_Arcobacter_rare$samplesums +1) # add total coverage to Arcobacter df Arcobacter_samples <- c(as.character(df_Arcobacter$Sample)) ss_psdata_Arcobacter <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Arcobacter_samples) df_Arcobacter$samplesums_total <- sample_sums(ss_psdata_Arcobacter) df_Arcobacter$ratio_ss <- df_Arcobacter$samplesums/df_Arcobacter$samplesums_total df_Arcobacter$log_ratio <- log(df_Arcobacter$samplesums + 1) - log(df_Arcobacter$samplesums_total + 1) #plot ggplot(df_Arcobacter, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Arcobacter_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Arcobacter$date <- paste0(df_Arcobacter$Year, "-",df_Arcobacter$Month,"-",df_Arcobacter$Day) df_Arcobacter$date <- as.Date(df_Arcobacter$date, format = "%Y-%m-%d") df_Arcobacter_rare$date <- paste0(df_Arcobacter_rare$Year, "-",df_Arcobacter_rare$Month,"-",df_Arcobacter_rare$Day) df_Arcobacter_rare$date <- as.Date(df_Arcobacter_rare$date, format = "%Y-%m-%d") # linear regression lm_Arcobacter <- lm(data=df_Arcobacter, log_ratio ~ date + Site) anova(lm_Arcobacter) summary(lm_Arcobacter) # model assumptions hist(resid(lm_Arcobacter)) plot(resid(lm_Arcobacter) ~ fitted(lm_Arcobacter)) library(PMCMR) attach(df_Arcobacter) ph_kwtest_Arcobacter <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Arcobacter, file = "Stats_Arcobacter_logRatio_Site.txt") detach(df_Arcobacter) # Arcobacter rarefied data # linear regression lm_Arcobacter_rare <- lm(data=df_Arcobacter_rare, log_samplesums_add1 ~ date + Site) anova(lm_Arcobacter_rare) summary(lm_Arcobacter_rare) # model assumptions hist(resid(lm_Arcobacter_rare)) plot(resid(lm_Arcobacter_rare) ~ fitted(lm_Arcobacter_rare)) library(PMCMR) attach(df_Arcobacter_rare) ph_kwtest_Arcobacter_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Arcobacter_rare, file = "Stats_Arcobacter_logCount_rare18059_Site.txt") detach(df_Arcobacter_rare) kruskal_Arcobacter_rare <- kruskal.test(df_Arcobacter_rare$log_samplesums_add1, df_Arcobacter$Site) capture.output(kruskal_Arcobacter_rare, file = "kruskal_Arcobacter_rare.txt") #2 Cloacibacterium ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Cloacibacterium_ps)$samplesums <- sample_sums(Cloacibacterium_ps) df_Cloacibacterium <- as.data.frame(sample_data(Cloacibacterium_ps)@.Data) colnames(df_Cloacibacterium) <- sample_data(Cloacibacterium_ps)@names head(df_Cloacibacterium) df_Cloacibacterium$log_samplesums <- log(df_Cloacibacterium$samplesums) df_Cloacibacterium$log_samplesums_add1 <- log(df_Cloacibacterium$samplesums +1) # same on rarefied data sample_data(Cloacibacterium_ps_rare)$samplesums <- sample_sums(Cloacibacterium_ps_rare) df_Cloacibacterium_rare <- as.data.frame(sample_data(Cloacibacterium_ps_rare)@.Data) colnames(df_Cloacibacterium_rare) <- sample_data(Cloacibacterium_ps_rare)@names head(df_Cloacibacterium_rare) df_Cloacibacterium_rare$log_samplesums <- log(df_Cloacibacterium_rare$samplesums) df_Cloacibacterium_rare$log_samplesums_add1 <- log(df_Cloacibacterium_rare$samplesums +1) # add total coverage to Cloacibacterium df Cloacibacterium_samples <- c(as.character(df_Cloacibacterium$Sample)) ss_psdata_Cloacibacterium <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Cloacibacterium_samples) df_Cloacibacterium$samplesums_total <- sample_sums(ss_psdata_Cloacibacterium) df_Cloacibacterium$ratio_ss <- df_Cloacibacterium$samplesums/df_Cloacibacterium$samplesums_total df_Cloacibacterium$log_ratio <- log(df_Cloacibacterium$samplesums + 1) - log(df_Cloacibacterium$samplesums_total + 1) #plot ggplot(df_Cloacibacterium, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Cloacibacterium_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Cloacibacterium$date <- paste0(df_Cloacibacterium$Year, "-",df_Cloacibacterium$Month,"-",df_Cloacibacterium$Day) df_Cloacibacterium$date <- as.Date(df_Cloacibacterium$date, format = "%Y-%m-%d") df_Cloacibacterium_rare$date <- paste0(df_Cloacibacterium_rare$Year, "-",df_Cloacibacterium_rare$Month,"-",df_Cloacibacterium_rare$Day) df_Cloacibacterium_rare$date <- as.Date(df_Cloacibacterium_rare$date, format = "%Y-%m-%d") # linear regression lm_Cloacibacterium <- lm(data=df_Cloacibacterium, log_ratio ~ date + Site) anova(lm_Cloacibacterium) summary(lm_Cloacibacterium) # model assumptions hist(resid(lm_Cloacibacterium)) plot(resid(lm_Cloacibacterium) ~ fitted(lm_Cloacibacterium)) library(PMCMR) attach(df_Cloacibacterium) ph_kwtest_Cloacibacterium <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Cloacibacterium, file = "Stats_Cloacibacterium_logRatio_Site.txt") detach(df_Cloacibacterium) # Cloacibacterium rarefied data # linear regression lm_Cloacibacterium_rare <- lm(data=df_Cloacibacterium_rare, log_samplesums_add1 ~ date + Site) anova(lm_Cloacibacterium_rare) summary(lm_Cloacibacterium_rare) # model assumptions hist(resid(lm_Cloacibacterium_rare)) plot(resid(lm_Cloacibacterium_rare) ~ fitted(lm_Cloacibacterium_rare)) library(PMCMR) attach(df_Cloacibacterium_rare) ph_kwtest_Cloacibacterium_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Cloacibacterium_rare, file = "Stats_Cloacibacterium_logCount_rare18059_Site.txt") detach(df_Cloacibacterium_rare) kruskal_Cloacibacterium_rare <- kruskal.test(df_Cloacibacterium_rare$log_samplesums_add1, df_Cloacibacterium$Site) capture.output(kruskal_Cloacibacterium_rare, file = "kruskal_Cloacibacterium_rare.txt") #3 Desulfovibrio ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Desulfovibrio_ps)$samplesums <- sample_sums(Desulfovibrio_ps) df_Desulfovibrio <- as.data.frame(sample_data(Desulfovibrio_ps)@.Data) colnames(df_Desulfovibrio) <- sample_data(Desulfovibrio_ps)@names head(df_Desulfovibrio) df_Desulfovibrio$log_samplesums <- log(df_Desulfovibrio$samplesums) df_Desulfovibrio$log_samplesums_add1 <- log(df_Desulfovibrio$samplesums +1) # same on rarefied data sample_data(Desulfovibrio_ps_rare)$samplesums <- sample_sums(Desulfovibrio_ps_rare) df_Desulfovibrio_rare <- as.data.frame(sample_data(Desulfovibrio_ps_rare)@.Data) colnames(df_Desulfovibrio_rare) <- sample_data(Desulfovibrio_ps_rare)@names head(df_Desulfovibrio_rare) df_Desulfovibrio_rare$log_samplesums <- log(df_Desulfovibrio_rare$samplesums) df_Desulfovibrio_rare$log_samplesums_add1 <- log(df_Desulfovibrio_rare$samplesums +1) # add total coverage to Desulfovibrio df Desulfovibrio_samples <- c(as.character(df_Desulfovibrio$Sample)) ss_psdata_Desulfovibrio <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Desulfovibrio_samples) df_Desulfovibrio$samplesums_total <- sample_sums(ss_psdata_Desulfovibrio) df_Desulfovibrio$ratio_ss <- df_Desulfovibrio$samplesums/df_Desulfovibrio$samplesums_total df_Desulfovibrio$log_ratio <- log(df_Desulfovibrio$samplesums + 1) - log(df_Desulfovibrio$samplesums_total + 1) #plot ggplot(df_Desulfovibrio, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Desulfovibrio_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Desulfovibrio$date <- paste0(df_Desulfovibrio$Year, "-",df_Desulfovibrio$Month,"-",df_Desulfovibrio$Day) df_Desulfovibrio$date <- as.Date(df_Desulfovibrio$date, format = "%Y-%m-%d") df_Desulfovibrio_rare$date <- paste0(df_Desulfovibrio_rare$Year, "-",df_Desulfovibrio_rare$Month,"-",df_Desulfovibrio_rare$Day) df_Desulfovibrio_rare$date <- as.Date(df_Desulfovibrio_rare$date, format = "%Y-%m-%d") # linear regression lm_Desulfovibrio <- lm(data=df_Desulfovibrio, log_ratio ~ date + Site) anova(lm_Desulfovibrio) summary(lm_Desulfovibrio) # model assumptions hist(resid(lm_Desulfovibrio)) plot(resid(lm_Desulfovibrio) ~ fitted(lm_Desulfovibrio)) library(PMCMR) attach(df_Desulfovibrio) ph_kwtest_Desulfovibrio <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Desulfovibrio, file = "Stats_Desulfovibrio_logRatio_Site.txt") detach(df_Desulfovibrio) # Desulfovibrio rarefied data # linear regression lm_Desulfovibrio_rare <- lm(data=df_Desulfovibrio_rare, log_samplesums_add1 ~ date + Site) anova(lm_Desulfovibrio_rare) summary(lm_Desulfovibrio_rare) # model assumptions hist(resid(lm_Desulfovibrio_rare)) plot(resid(lm_Desulfovibrio_rare) ~ fitted(lm_Desulfovibrio_rare)) library(PMCMR) attach(df_Desulfovibrio_rare) ph_kwtest_Desulfovibrio_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Desulfovibrio_rare, file = "Stats_Desulfovibrio_logCount_rare18059_Site.txt") detach(df_Desulfovibrio_rare) kruskal_Desulfovibrio_rare <- kruskal.test(df_Desulfovibrio_rare$log_samplesums_add1, df_Desulfovibrio$Site) capture.output(kruskal_Desulfovibrio_rare, file = "kruskal_Desulfovibrio_rare.txt") #4 Lactobacillus ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Lactobacillus_ps)$samplesums <- sample_sums(Lactobacillus_ps) df_Lactobacillus <- as.data.frame(sample_data(Lactobacillus_ps)@.Data) colnames(df_Lactobacillus) <- sample_data(Lactobacillus_ps)@names head(df_Lactobacillus) df_Lactobacillus$log_samplesums <- log(df_Lactobacillus$samplesums) df_Lactobacillus$log_samplesums_add1 <- log(df_Lactobacillus$samplesums +1) # same on rarefied data sample_data(Lactobacillus_ps_rare)$samplesums <- sample_sums(Lactobacillus_ps_rare) df_Lactobacillus_rare <- as.data.frame(sample_data(Lactobacillus_ps_rare)@.Data) colnames(df_Lactobacillus_rare) <- sample_data(Lactobacillus_ps_rare)@names head(df_Lactobacillus_rare) df_Lactobacillus_rare$log_samplesums <- log(df_Lactobacillus_rare$samplesums) df_Lactobacillus_rare$log_samplesums_add1 <- log(df_Lactobacillus_rare$samplesums +1) # add total coverage to Lactobacillus df Lactobacillus_samples <- c(as.character(df_Lactobacillus$Sample)) ss_psdata_Lactobacillus <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Lactobacillus_samples) df_Lactobacillus$samplesums_total <- sample_sums(ss_psdata_Lactobacillus) df_Lactobacillus$ratio_ss <- df_Lactobacillus$samplesums/df_Lactobacillus$samplesums_total df_Lactobacillus$log_ratio <- log(df_Lactobacillus$samplesums + 1) - log(df_Lactobacillus$samplesums_total + 1) #plot ggplot(df_Lactobacillus, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Lactobacillus_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Lactobacillus$date <- paste0(df_Lactobacillus$Year, "-",df_Lactobacillus$Month,"-",df_Lactobacillus$Day) df_Lactobacillus$date <- as.Date(df_Lactobacillus$date, format = "%Y-%m-%d") df_Lactobacillus_rare$date <- paste0(df_Lactobacillus_rare$Year, "-",df_Lactobacillus_rare$Month,"-",df_Lactobacillus_rare$Day) df_Lactobacillus_rare$date <- as.Date(df_Lactobacillus_rare$date, format = "%Y-%m-%d") # linear regression lm_Lactobacillus <- lm(data=df_Lactobacillus, log_ratio ~ date + Site) anova(lm_Lactobacillus) summary(lm_Lactobacillus) # model assumptions hist(resid(lm_Lactobacillus)) plot(resid(lm_Lactobacillus) ~ fitted(lm_Lactobacillus)) library(PMCMR) attach(df_Lactobacillus) ph_kwtest_Lactobacillus <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Lactobacillus, file = "Stats_Lactobacillus_logRatio_Site.txt") detach(df_Lactobacillus) # Lactobacillus rarefied data # linear regression lm_Lactobacillus_rare <- lm(data=df_Lactobacillus_rare, log_samplesums_add1 ~ date + Site) anova(lm_Lactobacillus_rare) summary(lm_Lactobacillus_rare) # model assumptions hist(resid(lm_Lactobacillus_rare)) plot(resid(lm_Lactobacillus_rare) ~ fitted(lm_Lactobacillus_rare)) library(PMCMR) attach(df_Lactobacillus_rare) ph_kwtest_Lactobacillus_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Lactobacillus_rare, file = "Stats_Lactobacillus_logCount_rare18059_Site.txt") detach(df_Lactobacillus_rare) kruskal_Lactobacillus_rare <- kruskal.test(df_Lactobacillus_rare$log_samplesums_add1, df_Lactobacillus$Site) capture.output(kruskal_Lactobacillus_rare, file = "kruskal_Lactobacillus_rare.txt") #5 Paludibacter ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Paludibacter_ps)$samplesums <- sample_sums(Paludibacter_ps) df_Paludibacter <- as.data.frame(sample_data(Paludibacter_ps)@.Data) colnames(df_Paludibacter) <- sample_data(Paludibacter_ps)@names head(df_Paludibacter) df_Paludibacter$log_samplesums <- log(df_Paludibacter$samplesums) df_Paludibacter$log_samplesums_add1 <- log(df_Paludibacter$samplesums +1) # same on rarefied data sample_data(Paludibacter_ps_rare)$samplesums <- sample_sums(Paludibacter_ps_rare) df_Paludibacter_rare <- as.data.frame(sample_data(Paludibacter_ps_rare)@.Data) colnames(df_Paludibacter_rare) <- sample_data(Paludibacter_ps_rare)@names head(df_Paludibacter_rare) df_Paludibacter_rare$log_samplesums <- log(df_Paludibacter_rare$samplesums) df_Paludibacter_rare$log_samplesums_add1 <- log(df_Paludibacter_rare$samplesums +1) # add total coverage to Paludibacter df Paludibacter_samples <- c(as.character(df_Paludibacter$Sample)) ss_psdata_Paludibacter <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Paludibacter_samples) df_Paludibacter$samplesums_total <- sample_sums(ss_psdata_Paludibacter) df_Paludibacter$ratio_ss <- df_Paludibacter$samplesums/df_Paludibacter$samplesums_total df_Paludibacter$log_ratio <- log(df_Paludibacter$samplesums + 1) - log(df_Paludibacter$samplesums_total + 1) #plot ggplot(df_Paludibacter, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Paludibacter_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Paludibacter$date <- paste0(df_Paludibacter$Year, "-",df_Paludibacter$Month,"-",df_Paludibacter$Day) df_Paludibacter$date <- as.Date(df_Paludibacter$date, format = "%Y-%m-%d") df_Paludibacter_rare$date <- paste0(df_Paludibacter_rare$Year, "-",df_Paludibacter_rare$Month,"-",df_Paludibacter_rare$Day) df_Paludibacter_rare$date <- as.Date(df_Paludibacter_rare$date, format = "%Y-%m-%d") # linear regression lm_Paludibacter <- lm(data=df_Paludibacter, log_ratio ~ date + Site) anova(lm_Paludibacter) summary(lm_Paludibacter) # model assumptions hist(resid(lm_Paludibacter)) plot(resid(lm_Paludibacter) ~ fitted(lm_Paludibacter)) library(PMCMR) attach(df_Paludibacter) ph_kwtest_Paludibacter <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Paludibacter, file = "Stats_Paludibacter_logRatio_Site.txt") detach(df_Paludibacter) # Paludibacter rarefied data # linear regression lm_Paludibacter_rare <- lm(data=df_Paludibacter_rare, log_samplesums_add1 ~ date + Site) anova(lm_Paludibacter_rare) summary(lm_Paludibacter_rare) # model assumptions hist(resid(lm_Paludibacter_rare)) plot(resid(lm_Paludibacter_rare) ~ fitted(lm_Paludibacter_rare)) library(PMCMR) attach(df_Paludibacter_rare) ph_kwtest_Paludibacter_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Paludibacter_rare, file = "Stats_Paludibacter_logCount_rare18059_Site.txt") detach(df_Paludibacter_rare) kruskal_Paludibacter_rare <- kruskal.test(df_Paludibacter_rare$log_samplesums_add1, df_Paludibacter$Site) capture.output(kruskal_Paludibacter_rare, file = "kruskal_Paludibacter_rare.txt") #6 Prevotella ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Prevotella_ps)$samplesums <- sample_sums(Prevotella_ps) df_Prevotella <- as.data.frame(sample_data(Prevotella_ps)@.Data) colnames(df_Prevotella) <- sample_data(Prevotella_ps)@names head(df_Prevotella) df_Prevotella$log_samplesums <- log(df_Prevotella$samplesums) df_Prevotella$log_samplesums_add1 <- log(df_Prevotella$samplesums +1) # same on rarefied data sample_data(Prevotella_ps_rare)$samplesums <- sample_sums(Prevotella_ps_rare) df_Prevotella_rare <- as.data.frame(sample_data(Prevotella_ps_rare)@.Data) colnames(df_Prevotella_rare) <- sample_data(Prevotella_ps_rare)@names head(df_Prevotella_rare) df_Prevotella_rare$log_samplesums <- log(df_Prevotella_rare$samplesums) df_Prevotella_rare$log_samplesums_add1 <- log(df_Prevotella_rare$samplesums +1) # add total coverage to Prevotella df Prevotella_samples <- c(as.character(df_Prevotella$Sample)) ss_psdata_Prevotella <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Prevotella_samples) df_Prevotella$samplesums_total <- sample_sums(ss_psdata_Prevotella) df_Prevotella$ratio_ss <- df_Prevotella$samplesums/df_Prevotella$samplesums_total df_Prevotella$log_ratio <- log(df_Prevotella$samplesums + 1) - log(df_Prevotella$samplesums_total + 1) #plot ggplot(df_Prevotella, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Prevotella_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Prevotella$date <- paste0(df_Prevotella$Year, "-",df_Prevotella$Month,"-",df_Prevotella$Day) df_Prevotella$date <- as.Date(df_Prevotella$date, format = "%Y-%m-%d") df_Prevotella_rare$date <- paste0(df_Prevotella_rare$Year, "-",df_Prevotella_rare$Month,"-",df_Prevotella_rare$Day) df_Prevotella_rare$date <- as.Date(df_Prevotella_rare$date, format = "%Y-%m-%d") # linear regression lm_Prevotella <- lm(data=df_Prevotella, log_ratio ~ date + Site) anova(lm_Prevotella) summary(lm_Prevotella) # model assumptions hist(resid(lm_Prevotella)) plot(resid(lm_Prevotella) ~ fitted(lm_Prevotella)) library(PMCMR) attach(df_Prevotella) ph_kwtest_Prevotella <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Prevotella, file = "Stats_Prevotella_logRatio_Site.txt") detach(df_Prevotella) # Prevotella rarefied data # linear regression lm_Prevotella_rare <- lm(data=df_Prevotella_rare, log_samplesums_add1 ~ date + Site) anova(lm_Prevotella_rare) summary(lm_Prevotella_rare) # model assumptions hist(resid(lm_Prevotella_rare)) plot(resid(lm_Prevotella_rare) ~ fitted(lm_Prevotella_rare)) library(PMCMR) attach(df_Prevotella_rare) ph_kwtest_Prevotella_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Prevotella_rare, file = "Stats_Prevotella_logCount_rare18059_Site.txt") detach(df_Prevotella_rare) kruskal_Prevotella_rare <- kruskal.test(df_Prevotella_rare$log_samplesums_add1, df_Prevotella$Site) capture.output(kruskal_Prevotella_rare, file = "kruskal_Prevotella_rare.txt") #7 Selenomonas ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Selenomonas_ps)$samplesums <- sample_sums(Selenomonas_ps) df_Selenomonas <- as.data.frame(sample_data(Selenomonas_ps)@.Data) colnames(df_Selenomonas) <- sample_data(Selenomonas_ps)@names head(df_Selenomonas) df_Selenomonas$log_samplesums <- log(df_Selenomonas$samplesums) df_Selenomonas$log_samplesums_add1 <- log(df_Selenomonas$samplesums +1) # same on rarefied data sample_data(Selenomonas_ps_rare)$samplesums <- sample_sums(Selenomonas_ps_rare) df_Selenomonas_rare <- as.data.frame(sample_data(Selenomonas_ps_rare)@.Data) colnames(df_Selenomonas_rare) <- sample_data(Selenomonas_ps_rare)@names head(df_Selenomonas_rare) df_Selenomonas_rare$log_samplesums <- log(df_Selenomonas_rare$samplesums) df_Selenomonas_rare$log_samplesums_add1 <- log(df_Selenomonas_rare$samplesums +1) # add total coverage to Selenomonas df Selenomonas_samples <- c(as.character(df_Selenomonas$Sample)) ss_psdata_Selenomonas <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Selenomonas_samples) df_Selenomonas$samplesums_total <- sample_sums(ss_psdata_Selenomonas) df_Selenomonas$ratio_ss <- df_Selenomonas$samplesums/df_Selenomonas$samplesums_total df_Selenomonas$log_ratio <- log(df_Selenomonas$samplesums + 1) - log(df_Selenomonas$samplesums_total + 1) #plot ggplot(df_Selenomonas, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Selenomonas_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Selenomonas$date <- paste0(df_Selenomonas$Year, "-",df_Selenomonas$Month,"-",df_Selenomonas$Day) df_Selenomonas$date <- as.Date(df_Selenomonas$date, format = "%Y-%m-%d") df_Selenomonas_rare$date <- paste0(df_Selenomonas_rare$Year, "-",df_Selenomonas_rare$Month,"-",df_Selenomonas_rare$Day) df_Selenomonas_rare$date <- as.Date(df_Selenomonas_rare$date, format = "%Y-%m-%d") # linear regression lm_Selenomonas <- lm(data=df_Selenomonas, log_ratio ~ date + Site) anova(lm_Selenomonas) summary(lm_Selenomonas) # model assumptions hist(resid(lm_Selenomonas)) plot(resid(lm_Selenomonas) ~ fitted(lm_Selenomonas)) library(PMCMR) attach(df_Selenomonas) ph_kwtest_Selenomonas <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Selenomonas, file = "Stats_Selenomonas_logRatio_Site.txt") detach(df_Selenomonas) # Selenomonas rarefied data # linear regression lm_Selenomonas_rare <- lm(data=df_Selenomonas_rare, log_samplesums_add1 ~ date + Site) anova(lm_Selenomonas_rare) summary(lm_Selenomonas_rare) # model assumptions hist(resid(lm_Selenomonas_rare)) plot(resid(lm_Selenomonas_rare) ~ fitted(lm_Selenomonas_rare)) library(PMCMR) attach(df_Selenomonas_rare) ph_kwtest_Selenomonas_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Selenomonas_rare, file = "Stats_Selenomonas_logCount_rare18059_Site.txt") detach(df_Selenomonas_rare) kruskal_Selenomonas_rare <- kruskal.test(df_Selenomonas_rare$log_samplesums_add1, df_Selenomonas$Site) capture.output(kruskal_Selenomonas_rare, file = "kruskal_Selenomonas_rare.txt") #8 Tolumonas ## mutate parameters to log(taxon reads+1) - log(total reads+1) sample_data(Tolumonas_ps)$samplesums <- sample_sums(Tolumonas_ps) df_Tolumonas <- as.data.frame(sample_data(Tolumonas_ps)@.Data) colnames(df_Tolumonas) <- sample_data(Tolumonas_ps)@names head(df_Tolumonas) df_Tolumonas$log_samplesums <- log(df_Tolumonas$samplesums) df_Tolumonas$log_samplesums_add1 <- log(df_Tolumonas$samplesums +1) # same on rarefied data sample_data(Tolumonas_ps_rare)$samplesums <- sample_sums(Tolumonas_ps_rare) df_Tolumonas_rare <- as.data.frame(sample_data(Tolumonas_ps_rare)@.Data) colnames(df_Tolumonas_rare) <- sample_data(Tolumonas_ps_rare)@names head(df_Tolumonas_rare) df_Tolumonas_rare$log_samplesums <- log(df_Tolumonas_rare$samplesums) df_Tolumonas_rare$log_samplesums_add1 <- log(df_Tolumonas_rare$samplesums +1) # add total coverage to Tolumonas df Tolumonas_samples <- c(as.character(df_Tolumonas$Sample)) ss_psdata_Tolumonas <- subset_samples(psdata_IVER_cleaned_noNA, Sample %in% Tolumonas_samples) df_Tolumonas$samplesums_total <- sample_sums(ss_psdata_Tolumonas) df_Tolumonas$ratio_ss <- df_Tolumonas$samplesums/df_Tolumonas$samplesums_total df_Tolumonas$log_ratio <- log(df_Tolumonas$samplesums + 1) - log(df_Tolumonas$samplesums_total + 1) #plot ggplot(df_Tolumonas, aes(x=Site, y=log_ratio, color = Site)) + geom_point(aes(), position = position_jitterdodge(1), alpha = 0.3, size = 2) + geom_boxplot(aes(), alpha = 0.75, outlier.shape = NULL) + scale_color_manual(values = colorset[c(3,2,4,11,9,6,28,13)]) + theme_classic() dev.off() #safe fig as jpeg ggsave(file="Ratio_Tolumonas_perSite_new.jpeg", width=5, height=3, dpi=300, bg="transparent") #### statistics target species ##### # adjust time df_Tolumonas$date <- paste0(df_Tolumonas$Year, "-",df_Tolumonas$Month,"-",df_Tolumonas$Day) df_Tolumonas$date <- as.Date(df_Tolumonas$date, format = "%Y-%m-%d") df_Tolumonas_rare$date <- paste0(df_Tolumonas_rare$Year, "-",df_Tolumonas_rare$Month,"-",df_Tolumonas_rare$Day) df_Tolumonas_rare$date <- as.Date(df_Tolumonas_rare$date, format = "%Y-%m-%d") # linear regression lm_Tolumonas <- lm(data=df_Tolumonas, log_ratio ~ date + Site) anova(lm_Tolumonas) summary(lm_Tolumonas) # model assumptions hist(resid(lm_Tolumonas)) plot(resid(lm_Tolumonas) ~ fitted(lm_Tolumonas)) library(PMCMR) attach(df_Tolumonas) ph_kwtest_Tolumonas <- posthoc.kruskal.dunn.test(log_ratio, Site, "BH") capture.output(ph_kwtest_Tolumonas, file = "Stats_Tolumonas_logRatio_Site.txt") detach(df_Tolumonas) # Tolumonas rarefied data # linear regression lm_Tolumonas_rare <- lm(data=df_Tolumonas_rare, log_samplesums_add1 ~ date + Site) anova(lm_Tolumonas_rare) summary(lm_Tolumonas_rare) # model assumptions hist(resid(lm_Tolumonas_rare)) plot(resid(lm_Tolumonas_rare) ~ fitted(lm_Tolumonas_rare)) library(PMCMR) attach(df_Tolumonas_rare) ph_kwtest_Tolumonas_rare <- posthoc.kruskal.dunn.test(log_samplesums_add1, Site, "BH") capture.output(ph_kwtest_Tolumonas_rare, file = "Stats_Tolumonas_logCount_rare18059_Site.txt") detach(df_Tolumonas_rare) kruskal_Tolumonas_rare <- kruskal.test(df_Tolumonas_rare$log_samplesums_add1, df_Tolumonas$Site) capture.output(kruskal_Tolumonas_rare, file = "kruskal_Tolumonas_rare.txt") ################################ R -- Plot stacked barplots of Phylum relative abundances ################################ ## Plot is manually adjusted and reformatted in Adobe Illustrator # hospital hospital <- subset_samples(psdata_IVER_cleaned_all, Site == "H") hospital_phylum <- tax_glom(hospital, taxrank = "Phylum") sample_data(hospital_phylum)$Month <- as.factor(as.character(sample_data(hospital_phylum)$Month)) hospital_phylum_merged <- merge_samples(hospital_phylum, "Month", fun = mean) unique(sample_names(hospital_phylum)) hospital_phylum_merged_rel <- transform_sample_counts(hospital_phylum_merged, function(x) x/sum(x)*100) H <- plot_ordered_bar_no_legend(hospital_phylum_merged_rel, x="Month", fill = "Phylum") + guides(fill = FALSE) + scale_fill_manual(values = colorset) # nursing nursing <- subset_samples(psdata_IVER_cleaned_all, Site == "N") nursing_phylum <- tax_glom(nursing, taxrank = "Phylum") sample_data(nursing_phylum)$Month <- as.factor(as.character(sample_data(nursing_phylum)$Month)) nursing_phylum_merged <- merge_samples(nursing_phylum, "Month", fun = mean) unique(sample_names(nursing_phylum)) nursing_phylum_merged_rel <- transform_sample_counts(nursing_phylum_merged, function(x) x/sum(x)*100) N <- plot_ordered_bar_no_legend(nursing_phylum_merged_rel, x="Month", fill = "Phylum") + guides(fill = FALSE) + scale_fill_manual(values = colorset) # community community <- subset_samples(psdata_IVER_cleaned_all, Site == "C") community_phylum <- tax_glom(community, taxrank = "Phylum") sample_data(community_phylum)$Month <- as.factor(as.character(sample_data(community_phylum)$Month)) community_phylum_merged <- merge_samples(community_phylum, "Month", fun = mean) unique(sample_names(community_phylum)) community_phylum_merged_rel <- transform_sample_counts(community_phylum_merged, function(x) x/sum(x)*100) C <- plot_ordered_bar_no_legend(community_phylum_merged_rel, x="Month", fill = "Phylum") + guides(fill = FALSE)+ scale_fill_manual(values = colorset) # influent influent <- subset_samples(psdata_IVER_cleaned_all, Site == "I") influent_phylum <- tax_glom(influent, taxrank = "Phylum") sample_data(influent_phylum)$Month <- as.factor(as.character(sample_data(influent_phylum)$Month)) influent_phylum_merged <- merge_samples(influent_phylum, "Month", fun = mean) unique(sample_names(influent_phylum)) influent_phylum_merged_rel <- transform_sample_counts(influent_phylum_merged, function(x) x/sum(x)*100) I <- plot_ordered_bar_no_legend(influent_phylum_merged_rel, x="Month", fill = "Phylum") + guides(fill = FALSE)+ scale_fill_manual(values = colorset) # effluent effluent <- subset_samples(psdata_IVER_cleaned_all, Site == "E") effluent_phylum <- tax_glom(effluent, taxrank = "Phylum") sample_data(effluent_phylum)$Month <- as.factor(as.character(sample_data(effluent_phylum)$Month)) effluent_phylum_merged <- merge_samples(effluent_phylum, "Month", fun = mean) unique(sample_names(effluent_phylum)) effluent_phylum_merged_rel <- transform_sample_counts(effluent_phylum_merged, function(x) x/sum(x)*100) E <- plot_ordered_bar_no_legend(effluent_phylum_merged_rel, x="Month", fill = "Phylum") + guides(fill = FALSE)+ scale_fill_manual(values = colorset) # up getwd() up <- subset_samples(psdata_IVER_cleaned_all, Site == "up") write.table(as(sample_data(psdata_IVER_cleaned_all), "data.frame"), file= "check_datums.txt", sep = "\t", dec = ".") up_phylum <- tax_glom(up, taxrank = "Phylum") sample_data(up_phylum)$Month <- as.factor(as.character(sample_data(up_phylum)$Month)) up_phylum_merged <- merge_samples(up_phylum, "Month", fun = mean) unique(sample_names(up_phylum)) up_phylum_merged_rel <- transform_sample_counts(up_phylum_merged, function(x) x/sum(x)*100) UP <- plot_ordered_bar_no_legend(up_phylum_merged_rel, x="Month", fill = "Phylum") + guides(fill = FALSE)+ scale_fill_manual(values = colorset) # down down <- subset_samples(psdata_IVER_cleaned_all, Site == "down") down_phylum <- tax_glom(down, taxrank = "Phylum") sample_data(down_phylum)$Month <- as.factor(as.character(sample_data(down_phylum)$Month)) down_phylum_merged <- merge_samples(down_phylum, "Month", fun = mean) unique(sample_names(down_phylum)) down_phylum_merged_rel <- transform_sample_counts(down_phylum_merged, function(x) x/sum(x)*100) DOWN <- plot_ordered_bar_no_legend(down_phylum_merged_rel, x="Month", fill = "Phylum") + guides(fill = FALSE)+ scale_fill_manual(values = colorset) ## List of absolute physeqs with phylum count tax_glommed per sample type list_phylum_counts_IVER <- list(hospital_phylum_merged, nursing_phylum_merged, community_phylum_merged, influent_phylum_merged, effluent_phylum_merged, up_phylum_merged, down_phylum_merged, control_phylum_merged ) saveRDS(list_phylum_counts_IVER, "list_phylum_counts_IVER.Rds") # control control <- subset_samples(psdata_IVER_cleaned_all, Site == "control") control_phylum <- tax_glom(control, taxrank = "Phylum") sample_data(control_phylum)$Month <- as.factor(as.character(sample_data(control_phylum)$Month)) control_phylum_merged <- merge_samples(control_phylum, "Month", fun = mean) unique(sample_names(control_phylum)) control_phylum_merged_rel <- transform_sample_counts(control_phylum_merged, function(x) x/sum(x)*100) CONTROL <- plot_ordered_bar_no_legend(control_phylum_merged_rel, x="Month", fill = "Phylum") + guides(fill = FALSE) library(gridExtra) pdf("Stacked_barplot_Phyla_bySite3.pdf", useDingbats = F, width = 15, height = 9) grid.arrange(H,N,C,I,E,UP,DOWN,CONTROL, ncol = 4, nrow = 2) dev.off() ################################ R -- Alpha diversity ################################ #### IVER - Alpha diversity - rarefied at 18059 r/s # load packages library(phyloseq) library(ggplot2) library(gridExtra) ## alpha diversity # input data set.seed(711) psdata_IVER_cleaned_noNA_18059 <- rarefy_even_depth(psdata_IVER_cleaned_noNA, 18059, rngseed = 711) psdata_IVER_cleaned_noNA_18059 # estimate alpha diversity on rarefied data richness_IVER_cleaned_noNA_18059 <- estimate_richness(psdata_IVER_cleaned_noNA_18059) richness_IVER_cleaned_noNA_18059$Sample <- data.frame(sample_data(psdata_IVER_cleaned_noNA_18059))$Sample sample_data(psdata_IVER_cleaned_noNA_18059)$Site <- factor(sample_data(psdata_IVER_cleaned_noNA_18059)$Site, levels = c("H","N","C","I","E","up","down","control") ) richness_IVER_cleaned_noNA_18059 <- plyr::join(data.frame(sample_data(psdata_IVER_cleaned_noNA_18059)), richness_IVER_cleaned_noNA_18059, by = "Sample", type = "left") pdf("ASV_Richness_Shannon__AllSites_raref-18059.pdf", width = 9, height = 4, useDingbats = F) richness_IVER <- ggplot(data=richness_IVER_cleaned_noNA_18059, aes(x = Site,y = Observed, color = Site)) + geom_point(aes(y = Observed), alpha = 0.3, position = position_jitterdodge(2), size = 2) + scale_color_manual(values = c(colorset[c(3,2,4,11,9,6,28,13)])) + geom_boxplot(aes(), alpha = 0.7, outlier.shape = NA) + #ggtitle("Observed ASV richness --after subsampling at 18059 reads") + labs(y="ASV richness") shannon_IVER <- ggplot(data=richness_IVER_cleaned_noNA_18059, aes(x = Site, y = Shannon,color = Site)) + geom_point(aes(), alpha = 0.3, position = position_jitterdodge(2), size = 2) + scale_color_manual(values = c(colorset[c(3,2,4,11,9,6,28,13)])) + geom_boxplot(aes(),alpha = 0.7, outlier.shape = NA) + #ggtitle("ASV Shannon diversity --after subsampling at 18059 reads") + labs(y="Shannon diversity") gridExtra::grid.arrange(richness_IVER, shannon_IVER, ncol = 2) dev.off() # statistics for site difference # linear regression lm_IVER_sites_Observed <- lm(data=richness_IVER_cleaned_noNA_18059, Observed ~ Site) ANOVA_IVER_sites_Observed <- anova(lm_IVER_sites_Observed) capture.output(ANOVA_IVER_sites_Observed, file = "Stats_IVER_ANOVA_Richness18059_Site.txt") summary(lm_IVER_sites_Observed) # model assumptions hist(resid(lm_IVER_sites_Observed)) plot(resid(lm_IVER_sites_Observed) ~ fitted(lm_IVER_sites_Observed)) # normality and homogeneity of variances look ok # TukeyHSD for pairwise comparisons AOV_IVER_sites_Observed <- aov(richness_IVER_cleaned_noNA_18059$Observed ~ richness_IVER_cleaned_noNA_18059$Site) Tukey_richness_IVER_18058 <- TukeyHSD(x=AOV_IVER_sites_Observed, 'richness_IVER_cleaned_noNA_18059$Site', conf.level=0.95, ordered = F) capture.output(Tukey_richness_IVER_18058, file = "Stats_IVER_pairwise_TukeyHSD_Richness18059_Site.txt") # SHannon # statistics for site difference # linear regression lm_IVER_sites_Shannon <- lm(data=richness_IVER_cleaned_noNA_18059, Shannon ~ Site) ANOVA_IVER_sites_Shannon <- anova(lm_IVER_sites_Shannon) capture.output(ANOVA_IVER_sites_Shannon, file = "Stats_IVER_ANOVA_Shannon18059_Site.txt") summary(lm_IVER_sites_Shannon) # model assumptions hist(resid(lm_IVER_sites_Shannon)) plot(resid(lm_IVER_sites_Shannon) ~ fitted(lm_IVER_sites_Shannon)) # TukeyHSD for pairwise comparisons AOV_IVER_sites_Shannon <- aov(richness_IVER_cleaned_noNA_18059$Shannon ~ richness_IVER_cleaned_noNA_18059$Site) Tukey_Shannon_IVER_18059 <- TukeyHSD(x=AOV_IVER_sites_Shannon, 'richness_IVER_cleaned_noNA_18059$Site', conf.level=0.95, ordered = F) capture.output(Tukey_Shannon_IVER_18059, file = "Stats_IVER_pairwise_TukeyHSD_Shannon18059_Site.txt") ### Analysis of log-ratios Plylum-X vs Bacteroidetes ## PVEE ## 20200121 #Planctomyctes vs Bacteroidetes # load example data frame for LN(Planctomycetes/Bacteroidetes) # create input excel files by exporting from: # open baseline-diff.qzv on view.qiime2.org # setwd by 'Session --> Set workdirectory --> choose directory' to folder with QIIME2 output files data <- read.table("Planctomycetes_VS_Bacteroidetes_x_Temperature_y_LN-LogRatio_sample_plot_data.tsv", sep = "\t", dec = ".", header = T) # prepare input data head(data) View(data) data$Current_Natural_Log_Ratio <- as.numeric(as.character(data$Current_Natural_Log_Ratio)) str(data) # build linear model (regression) with interaction between temperature and site lm_Planctomycetes_Bacteroidetes_Temp <- lm(data=data, Current_Natural_Log_Ratio ~ Temperature * Site) anova_Planctomycetes_VS_Bacteroidetes <- anova(lm_Planctomycetes_Bacteroidetes_Temp) capture.output(anova_Planctomycetes_VS_Bacteroidetes, "anova_Planctomycetes_VS_Bacteroidetes.txt") ## If interaction is not significant, then remove Site from model lm_Planctomycetes_Bacteroidetes_Temp2 <- lm(data=data, Current_Natural_Log_Ratio ~ Temperature) anova(lm_Planctomycetes_Bacteroidetes_Temp2) ## If interaction is significant, find temperature effects for every site separately. BiocManager::install("phia") library(phia) Site_interaction_Planctomycetes_VS_Bacteroidetes <- testInteractions(lm_Planctomycetes_Bacteroidetes_Temp, fixed = "Site", slope = "Temperature", adjustment = "BH") Site_interaction_Planctomycetes_VS_Bacteroidetes capture.output(Site_interaction_Planctomycetes_VS_Bacteroidetes, "Site_interaction_Planctomycetes_VS_Bacteroidetes.txt") library(ggplot2) ggplot(data=data, aes(x=Temperature, y=Current_Natural_Log_Ratio, color = Site)) + geom_point() #repeat with other Phylum vs bacteroidetes ### fi ###