1
votes

I have a Phyloseq object with my OTU table and TAX table. I would like to create a bar plot, at for instance family level, but families belonging to the same Phylum will be displayed with the same colour and be distinguished by a gradient of this color.

The final result should be similar to this:

Good Bar plot

I converted my phyloseq object into a dataframe using psmelt() and tried to adapt the code from this post : Stacked barplot with colour gradients for each bar

But I'm currently unable to create a correct graph.

library(phyloseq)
library(ggplot2)

df <- psmelt(GlobalPatterns)
df$group <- paste0(df$Phylum, "-", df$Family, sep = "")
colours <-ColourPalleteMulti(df, "Phylum", "Family")
ggplot(df, aes(Sample)) + 
  geom_bar(aes(fill = group), colour = "grey") +
  scale_fill_manual("Subject", values=colours, guide = "none")

Erreur : Insufficient values in manual scale. 395 needed but only 334 provided.

Thank you in advance for any help !

Edit: here the dput of the data

    dput(head(df, 10))
structure(list(OTU = c("549656", "279599", "549656", "549656", 
"360229", "331820", "94166", "331820", "329744", "189047"), Sample = c("AQC4cm", 
"LMEpi24M", "AQC7cm", "AQC1cm", "M31Tong", "M11Fcsw", "M31Tong", 
"M31Fcsw", "SLEpi20M", "TS29"), Abundance = c(1177685, 914209, 
711043, 554198, 540850, 452219, 396201, 354695, 323914, 251215
), X.SampleID = structure(c(2L, 10L, 3L, 1L, 16L, 11L, 16L, 14L, 
20L, 26L), .Label = c("AQC1cm", "AQC4cm", "AQC7cm", "CC1", "CL3", 
"Even1", "Even2", "Even3", "F21Plmr", "LMEpi24M", "M11Fcsw", 
"M11Plmr", "M11Tong", "M31Fcsw", "M31Plmr", "M31Tong", "NP2", 
"NP3", "NP5", "SLEpi20M", "SV1", "TRRsed1", "TRRsed2", "TRRsed3", 
"TS28", "TS29"), class = "factor"), Primer = structure(c(14L, 
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("ILBC_01", 
"ILBC_02", "ILBC_03", "ILBC_04", "ILBC_05", "ILBC_07", "ILBC_08", 
"ILBC_09", "ILBC_10", "ILBC_11", "ILBC_13", "ILBC_15", "ILBC_16", 
"ILBC_17", "ILBC_18", "ILBC_19", "ILBC_20", "ILBC_21", "ILBC_22", 
"ILBC_23", "ILBC_24", "ILBC_25", "ILBC_26", "ILBC_27", "ILBC_28", 
"ILBC_29"), class = "factor"), Final_Barcode = structure(c(14L, 
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("AACGCA", 
"AACTCG", "AACTGT", "AAGAGA", "AAGCTG", "AATCGT", "ACACAC", "ACACAT", 
"ACACGA", "ACACGG", "ACACTG", "ACAGAG", "ACAGCA", "ACAGCT", "ACAGTG", 
"ACAGTT", "ACATCA", "ACATGA", "ACATGT", "ACATTC", "ACCACA", "ACCAGA", 
"ACCAGC", "ACCGCA", "ACCTCG", "ACCTGT"), class = "factor"), Barcode_truncated_plus_T = structure(c(6L, 
10L, 8L, 25L, 19L, 9L, 19L, 20L, 14L, 16L), .Label = c("AACTGT", 
"ACAGGT", "ACAGTT", "ACATGT", "ACGATT", "AGCTGT", "ATGTGT", "CACTGT", 
"CAGCTT", "CAGTGT", "CCGTGT", "CGAGGT", "CGAGTT", "CTCTGT", "GAATGT", 
"GCTGGT", "GTGTGT", "TCATGT", "TCGTGT", "TCTCTT", "TCTGGT", "TGATGT", 
"TGCGGT", "TGCGTT", "TGCTGT", "TGTGGT"), class = "factor"), Barcode_full_length = structure(c(4L, 
7L, 3L, 13L, 26L, 8L, 26L, 21L, 2L, 11L), .Label = c("AGAGAGACAGG", 
"AGCCGACTCTG", "ATGAAGCACTG", "CAAGCTAGCTG", "CACGTGACATG", "CATCGACGAGT", 
"CATGAACAGTG", "CGACTGCAGCT", "CGAGTCACGAT", "CTAGCGTGCGT", "CTAGTCGCTGG", 
"GAACGATCATG", "GACCACTGCTG", "GATGTATGTGG", "GCATCGTCTGG", "GCCATAGTGTG", 
"GCTAAGTGATG", "GTACGCACAGT", "GTAGACATGTG", "TAGACACCGTG", "TCGACATCTCT", 
"TCGCGCAACTG", "TCTGATCGAGG", "TGACTCTGCGG", "TGCGCTGAATG", "TGTGGCTCGTG"
), class = "factor"), SampleType = structure(c(3L, 2L, 3L, 3L, 
9L, 1L, 9L, 1L, 2L, 1L), .Label = c("Feces", "Freshwater", "Freshwater (creek)", 
"Mock", "Ocean", "Sediment (estuary)", "Skin", "Soil", "Tongue"
), class = "factor"), Description = structure(c(2L, 10L, 3L, 
1L, 16L, 11L, 16L, 14L, 21L, 25L), .Label = c("Allequash Creek, 0-1cm depth", 
"Allequash Creek, 3-4 cm depth", "Allequash Creek, 6-7 cm depth", 
"Calhoun South Carolina Pine soil, pH 4.9", "Cedar Creek Minnesota, grassland, pH 6.1", 
"Even1", "Even2", "Even3", "F1, Day 1,  right palm, whole body study ", 
"Lake Mendota Minnesota, 24 meter epilimnion ", "M1, Day 1, fecal swab, whole body study ", 
"M1, Day 1, right palm, whole body study ", "M1, Day 1, tongue, whole body study ", 
"M3, Day 1, fecal swab, whole body study", "M3, Day 1, right palm, whole body study", 
"M3, Day 1, tongue, whole body study ", "Newport Pier, CA surface water, Time 1", 
"Newport Pier, CA surface water, Time 2", "Newport Pier, CA surface water, Time 3", 
"Sevilleta new Mexico, desert scrub, pH 8.3", "Sparkling Lake Wisconsin, 20 meter eplimnion", 
"Tijuana River Reserve, depth 1", "Tijuana River Reserve, depth 2", 
"Twin #1", "Twin #2"), class = "factor"), Kingdom = c("Bacteria", 
"Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", 
"Bacteria", "Bacteria", "Bacteria"), Phylum = c("Cyanobacteria", 
"Cyanobacteria", "Cyanobacteria", "Cyanobacteria", "Proteobacteria", 
"Bacteroidetes", "Proteobacteria", "Bacteroidetes", "Actinobacteria", 
"Firmicutes"), Class = c("Chloroplast", "Nostocophycideae", "Chloroplast", 
"Chloroplast", "Betaproteobacteria", "Bacteroidia", "Gammaproteobacteria", 
"Bacteroidia", "Actinobacteria", "Clostridia"), Order = c("Stramenopiles", 
"Nostocales", "Stramenopiles", "Stramenopiles", "Neisseriales", 
"Bacteroidales", "Pasteurellales", "Bacteroidales", "Actinomycetales", 
"Clostridiales"), Family = c(NA, "Nostocaceae", NA, NA, "Neisseriaceae", 
"Bacteroidaceae", "Pasteurellaceae", "Bacteroidaceae", "ACK-M1", 
"Ruminococcaceae"), Genus = c(NA, "Dolichospermum", NA, NA, "Neisseria", 
"Bacteroides", "Haemophilus", "Bacteroides", NA, NA), Species = c(NA, 
NA, NA, NA, NA, NA, "Haemophilusparainfluenzae", NA, NA, NA), 
    group = c("Cyanobacteria-NA", "Cyanobacteria-Nostocaceae", 
    "Cyanobacteria-NA", "Cyanobacteria-NA", "Proteobacteria-Neisseriaceae", 
    "Bacteroidetes-Bacteroidaceae", "Proteobacteria-Pasteurellaceae", 
    "Bacteroidetes-Bacteroidaceae", "Actinobacteria-ACK-M1", 
    "Firmicutes-Ruminococcaceae"), group = c("Cyanobacteria-NA", 
    "Cyanobacteria-Nostocaceae", "Cyanobacteria-NA", "Cyanobacteria-NA", 
    "Proteobacteria-Neisseriaceae", "Bacteroidetes-Bacteroidaceae", 
    "Proteobacteria-Pasteurellaceae", "Bacteroidetes-Bacteroidaceae", 
    "Actinobacteria-ACK-M1", "Firmicutes-Ruminococcaceae")), row.names = c(406582L, 
241435L, 406580L, 406574L, 329873L, 300794L, 494797L, 300772L, 
298689L, 114279L), class = "data.frame")

Edit 2: We are on the good way

So, your code seems to work perfectly in term of color but I have some doubts about the values of the bar plot (the percentage for each family).

I plotted a proportional bar plot of the data with this code:

GlobalPatterns_prop = transform_sample_counts(GlobalPatterns, function(x)       100 * x/sum(x)) 
plot_bar(GlobalPatterns_prop , fill = "Phylum")

and obtained this :

GlobalPatterns_proportional_Phylum

If I understand well, using your method a majority of phylum and bar height should be "Others". I did the same with my data and I clearly see a difference in Phylum proportional abundance.

I have for the moment no clue on what is happening...

2
Welcome to SO! Please provide your df using dput(df) or dput(head(df, 10)) if your df has many rows.kmacierzanka

2 Answers

1
votes

There's a few steps involved.

First, define the "Others".

phylums <- c('Proteobacteria','Bacteroidetes','Firmicutes')

df$Phylum[!df$Phylum %in% phylums] <- "Others"
df$Family[!df$Phylum %in% phylums] <- "Others"

df$Family[df$Phylum=="Proteobacteria" & 
 !df$Family %in% c('Alcaligenaceae','Enterobacteriaceae')] <- "Other Protobacteria"

df$Family[df$Phylum=="Bacteroidetes" &
 !df$Family %in% c('Bacteroidaceae','Rikenellaceae','Porphyromonadaceae')] <- "Other Bacteroidetes"

df$Family[df$Phylum=="Firmicutes" & 
 !df$Family %in% c('Lactobacillaceae','Clostridiaceae','Ruminococcaceae','Lachnospiraceae')] <- "Other Firmicutes"

Then, convert Phylum to a factor so that (1) the "Others" are placed last in the legend and (2) we can reorder the Family variable based on the underlying factor levels of Phylum and whether Family contains "Others". This ensures the colour gradients are correctly assigned.

library(forcats)
library(dplyr)
df2 <- select(df, Sample, Phylum, Family) %>%
  mutate(Phylum=factor(Phylum, levels=c(phylums, "Others")),
         Family=fct_reorder(Family, 10*as.integer(Phylum) + grepl("Others", Family))) %>%

  group_by(Family) %>%  # For this dataset only
  sample_n(100)         # Otherwise, unnecessary

The last two lines are extra that's not needed for real data, but here I've selected a sample of 100 within each Family so that the graph looks prettier. Otherwise, there are too many "Others" and in the graph, they swamp the others.

The custom function to create the colour gradients can be found in the accepted answer to this question (as you mentioned).

colours <- ColourPalleteMulti(df2, "Phylum", "Family")

Finally, instead of your group variable, we can use the Family variable so that the labelling is concise.

library(ggplot2)
ggplot(df2, aes(x=Sample, fill = Family)) + 
  geom_bar(position="fill", colour = "grey") +  # Stacked 100% barplot
  scale_fill_manual("", values=colours) +
  theme(axis.text.x=element_text(angle=90, vjust=0.5)) +  # Vertical x-axis tick labels
  scale_y_continuous(labels = scales::percent_format()) +
  labs(y="Relative abundance")

enter image description here

I couldn't manage to add the Phylum labels on the right of the legend. Perhaps you can add them manually.

0
votes

Great question and I'm really happy that there is solution to the two level coloring, great work Edward!

To add to the annotation part of your question. As a work around; you can make a seperate ggplot figure that shows the legend color and right annotations. Looking at the example figure showed I got quite close. I took this from this link.

https://coderedirect.com/questions/217402/add-annotation-and-segments-to-groups-of-legend-elements

First you want to make a dataframe listening alll your Taxonomic levels below each other. We are going to create concise x and y coordinates for both taxonomic levels and the 'Phyla brackets'. First arrange the right order and coordinates for the Family level.

coord_fam = df %>% select(Phylum, Family) %>% unique(
)  %>% ungroup()%>%mutate(x= c(rep(1,nrow(.))), y=1:nrow(.))

Now we want to calculate the top, middle and bottom of each group, so we can add the Phylum names and the Phylan brackets.

coord_phylum = coord_fam %>% group_by(Phylum) %>% summarise(x=mean(x),ymid= mean(y),
                                                           ymin=min(y), ymax=max(y))

Last you want to plot the coordinates correctly.

v=0.3
p2 = coord_fam %>% ggplot()+
  geom_point(aes(0.05,y, col= Family), size=8 )+
  scale_x_continuous(limits = c(0, 2)) +
  geom_segment(data = coord_phylum,
               aes(x = x + 0.1, xend = x + v, y= ymax, yend=ymax), col="black")+
  
  geom_segment(data = coord_phylum,
               aes(x = x + 0.1, xend = x + v, y= ymin, yend=ymin))+
  
  geom_segment(data = coord_phylum,
               aes(x = x + v, xend = x + v, y= ymin, yend=ymax))+
  
  geom_text(data = coord_phylum, aes(x = x + v+0.5, y = ymid, label = Phylum)) +
  geom_text(data = coord_fam, aes( x=0.6, y=y, label=Family, col=Family))+
  geom_text(data = coord_fam, aes( x=0.6, y=y, label=Family), alpha=0.9,col="grey50")+
  scale_colour_manual(values = colours)+
  theme_void()+theme(legend.position = "none")+ 
  scale_y_reverse()

p2

V is used to determine the length of the brackets.

When you put patch this together with the barplot, it can be a bit of a puzzle to find the right size for all of the geom_sizes, so start off small.

library(patchwork)
(p1+p1)

I hope this helps! You've probably already published your data by now, but maybe for the next manuscript.

Happy science, y'all!