I have a Phyloseq object with my OTU table and TAX table. I would like to create a bar plot, at for instance family level, but families belonging to the same Phylum will be displayed with the same colour and be distinguished by a gradient of this color.
The final result should be similar to this:
I converted my phyloseq object into a dataframe using psmelt()
and tried to adapt the code from this post : Stacked barplot with colour gradients for each bar
But I'm currently unable to create a correct graph.
library(phyloseq)
library(ggplot2)
df <- psmelt(GlobalPatterns)
df$group <- paste0(df$Phylum, "-", df$Family, sep = "")
colours <-ColourPalleteMulti(df, "Phylum", "Family")
ggplot(df, aes(Sample)) +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual("Subject", values=colours, guide = "none")
Erreur : Insufficient values in manual scale. 395 needed but only 334 provided.
Thank you in advance for any help !
Edit: here the dput of the data
dput(head(df, 10))
structure(list(OTU = c("549656", "279599", "549656", "549656",
"360229", "331820", "94166", "331820", "329744", "189047"), Sample = c("AQC4cm",
"LMEpi24M", "AQC7cm", "AQC1cm", "M31Tong", "M11Fcsw", "M31Tong",
"M31Fcsw", "SLEpi20M", "TS29"), Abundance = c(1177685, 914209,
711043, 554198, 540850, 452219, 396201, 354695, 323914, 251215
), X.SampleID = structure(c(2L, 10L, 3L, 1L, 16L, 11L, 16L, 14L,
20L, 26L), .Label = c("AQC1cm", "AQC4cm", "AQC7cm", "CC1", "CL3",
"Even1", "Even2", "Even3", "F21Plmr", "LMEpi24M", "M11Fcsw",
"M11Plmr", "M11Tong", "M31Fcsw", "M31Plmr", "M31Tong", "NP2",
"NP3", "NP5", "SLEpi20M", "SV1", "TRRsed1", "TRRsed2", "TRRsed3",
"TS28", "TS29"), class = "factor"), Primer = structure(c(14L,
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("ILBC_01",
"ILBC_02", "ILBC_03", "ILBC_04", "ILBC_05", "ILBC_07", "ILBC_08",
"ILBC_09", "ILBC_10", "ILBC_11", "ILBC_13", "ILBC_15", "ILBC_16",
"ILBC_17", "ILBC_18", "ILBC_19", "ILBC_20", "ILBC_21", "ILBC_22",
"ILBC_23", "ILBC_24", "ILBC_25", "ILBC_26", "ILBC_27", "ILBC_28",
"ILBC_29"), class = "factor"), Final_Barcode = structure(c(14L,
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("AACGCA",
"AACTCG", "AACTGT", "AAGAGA", "AAGCTG", "AATCGT", "ACACAC", "ACACAT",
"ACACGA", "ACACGG", "ACACTG", "ACAGAG", "ACAGCA", "ACAGCT", "ACAGTG",
"ACAGTT", "ACATCA", "ACATGA", "ACATGT", "ACATTC", "ACCACA", "ACCAGA",
"ACCAGC", "ACCGCA", "ACCTCG", "ACCTGT"), class = "factor"), Barcode_truncated_plus_T = structure(c(6L,
10L, 8L, 25L, 19L, 9L, 19L, 20L, 14L, 16L), .Label = c("AACTGT",
"ACAGGT", "ACAGTT", "ACATGT", "ACGATT", "AGCTGT", "ATGTGT", "CACTGT",
"CAGCTT", "CAGTGT", "CCGTGT", "CGAGGT", "CGAGTT", "CTCTGT", "GAATGT",
"GCTGGT", "GTGTGT", "TCATGT", "TCGTGT", "TCTCTT", "TCTGGT", "TGATGT",
"TGCGGT", "TGCGTT", "TGCTGT", "TGTGGT"), class = "factor"), Barcode_full_length = structure(c(4L,
7L, 3L, 13L, 26L, 8L, 26L, 21L, 2L, 11L), .Label = c("AGAGAGACAGG",
"AGCCGACTCTG", "ATGAAGCACTG", "CAAGCTAGCTG", "CACGTGACATG", "CATCGACGAGT",
"CATGAACAGTG", "CGACTGCAGCT", "CGAGTCACGAT", "CTAGCGTGCGT", "CTAGTCGCTGG",
"GAACGATCATG", "GACCACTGCTG", "GATGTATGTGG", "GCATCGTCTGG", "GCCATAGTGTG",
"GCTAAGTGATG", "GTACGCACAGT", "GTAGACATGTG", "TAGACACCGTG", "TCGACATCTCT",
"TCGCGCAACTG", "TCTGATCGAGG", "TGACTCTGCGG", "TGCGCTGAATG", "TGTGGCTCGTG"
), class = "factor"), SampleType = structure(c(3L, 2L, 3L, 3L,
9L, 1L, 9L, 1L, 2L, 1L), .Label = c("Feces", "Freshwater", "Freshwater (creek)",
"Mock", "Ocean", "Sediment (estuary)", "Skin", "Soil", "Tongue"
), class = "factor"), Description = structure(c(2L, 10L, 3L,
1L, 16L, 11L, 16L, 14L, 21L, 25L), .Label = c("Allequash Creek, 0-1cm depth",
"Allequash Creek, 3-4 cm depth", "Allequash Creek, 6-7 cm depth",
"Calhoun South Carolina Pine soil, pH 4.9", "Cedar Creek Minnesota, grassland, pH 6.1",
"Even1", "Even2", "Even3", "F1, Day 1, right palm, whole body study ",
"Lake Mendota Minnesota, 24 meter epilimnion ", "M1, Day 1, fecal swab, whole body study ",
"M1, Day 1, right palm, whole body study ", "M1, Day 1, tongue, whole body study ",
"M3, Day 1, fecal swab, whole body study", "M3, Day 1, right palm, whole body study",
"M3, Day 1, tongue, whole body study ", "Newport Pier, CA surface water, Time 1",
"Newport Pier, CA surface water, Time 2", "Newport Pier, CA surface water, Time 3",
"Sevilleta new Mexico, desert scrub, pH 8.3", "Sparkling Lake Wisconsin, 20 meter eplimnion",
"Tijuana River Reserve, depth 1", "Tijuana River Reserve, depth 2",
"Twin #1", "Twin #2"), class = "factor"), Kingdom = c("Bacteria",
"Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria",
"Bacteria", "Bacteria", "Bacteria"), Phylum = c("Cyanobacteria",
"Cyanobacteria", "Cyanobacteria", "Cyanobacteria", "Proteobacteria",
"Bacteroidetes", "Proteobacteria", "Bacteroidetes", "Actinobacteria",
"Firmicutes"), Class = c("Chloroplast", "Nostocophycideae", "Chloroplast",
"Chloroplast", "Betaproteobacteria", "Bacteroidia", "Gammaproteobacteria",
"Bacteroidia", "Actinobacteria", "Clostridia"), Order = c("Stramenopiles",
"Nostocales", "Stramenopiles", "Stramenopiles", "Neisseriales",
"Bacteroidales", "Pasteurellales", "Bacteroidales", "Actinomycetales",
"Clostridiales"), Family = c(NA, "Nostocaceae", NA, NA, "Neisseriaceae",
"Bacteroidaceae", "Pasteurellaceae", "Bacteroidaceae", "ACK-M1",
"Ruminococcaceae"), Genus = c(NA, "Dolichospermum", NA, NA, "Neisseria",
"Bacteroides", "Haemophilus", "Bacteroides", NA, NA), Species = c(NA,
NA, NA, NA, NA, NA, "Haemophilusparainfluenzae", NA, NA, NA),
group = c("Cyanobacteria-NA", "Cyanobacteria-Nostocaceae",
"Cyanobacteria-NA", "Cyanobacteria-NA", "Proteobacteria-Neisseriaceae",
"Bacteroidetes-Bacteroidaceae", "Proteobacteria-Pasteurellaceae",
"Bacteroidetes-Bacteroidaceae", "Actinobacteria-ACK-M1",
"Firmicutes-Ruminococcaceae"), group = c("Cyanobacteria-NA",
"Cyanobacteria-Nostocaceae", "Cyanobacteria-NA", "Cyanobacteria-NA",
"Proteobacteria-Neisseriaceae", "Bacteroidetes-Bacteroidaceae",
"Proteobacteria-Pasteurellaceae", "Bacteroidetes-Bacteroidaceae",
"Actinobacteria-ACK-M1", "Firmicutes-Ruminococcaceae")), row.names = c(406582L,
241435L, 406580L, 406574L, 329873L, 300794L, 494797L, 300772L,
298689L, 114279L), class = "data.frame")
Edit 2: We are on the good way
So, your code seems to work perfectly in term of color but I have some doubts about the values of the bar plot (the percentage for each family).
I plotted a proportional bar plot of the data with this code:
GlobalPatterns_prop = transform_sample_counts(GlobalPatterns, function(x) 100 * x/sum(x))
plot_bar(GlobalPatterns_prop , fill = "Phylum")
and obtained this :
If I understand well, using your method a majority of phylum and bar height should be "Others". I did the same with my data and I clearly see a difference in Phylum proportional abundance.
I have for the moment no clue on what is happening...
df
usingdput(df)
ordput(head(df, 10))
if yourdf
has many rows. – kmacierzanka