Title: | Quality Control for RNA-Seq Data |
---|---|
Description: | Functions for semi-automated quality control of bulk RNA-seq data. |
Authors: | Frederik Ziebell [cre],
Frederik Ziebell [aut] |
Maintainer: | Frederik Ziebell <[email protected]> |
License: | Apache License (>= 2) |
Version: | 0.2.1 |
Built: | 2025-02-10 06:02:44 UTC |
Source: | https://github.com/frederikziebell/rnaseqqc |
for a character vector x, check if all non-NA elements of x can be converted to numeric
all_numeric(x)
all_numeric(x)
x |
A character vector |
Filter genes with low counts
filter_genes(dds, min_count = 5, min_rep = 3)
filter_genes(dds, min_count = 5, min_rep = 3)
dds |
A DESeqDataSet |
min_count , min_rep
|
keep genes with at least min_count counts in at least min_rep replicates |
A DESeq2::DESeqDataSet
object with only those genes that meet the filter criteria.
library("DESeq2") dds <- makeExampleDESeqDataSet() filter_genes(dds)
library("DESeq2") dds <- makeExampleDESeqDataSet() filter_genes(dds)
Get all gene IDs in a DESeqDataSet for a given gene name.
get_gene_id(gene_name, dds)
get_gene_id(gene_name, dds)
gene_name |
A gene name |
dds |
A DESeqDataSet |
A character vector
get_gene_id("HBA1", T47D)
get_gene_id("HBA1", T47D)
Make DESeqDataSet from counts matrix and metadata
make_dds(counts, metadata, ah_record, design = ~1)
make_dds(counts, metadata, ah_record, design = ~1)
counts |
The genes x samples counts matrix with row names. At least one row name must be an ENSEMBL gene ID, since gene annotation is done via the ENSEMBL database. |
metadata |
data.frame of sample information. Order of rows corresponds to the order of columns in the counts matrix. |
ah_record |
ID of AnnotationHub record used to retrieve an |
design |
The design formula specified in library(AnnotationHub) mcols(AnnotationHub()) %>% as_tibble(rownames="ah_record") %>% filter(rdataclass=="EnsDb") |
A DESeq2::DESeqDataSet
object containing the counts matrix and metadata.
library("DESeq2") count_mat <- counts(T47D) meta <- data.frame(colData(T47D)) dds <- make_dds(counts = count_mat, metadata = meta, ah_record = "AH89426")
library("DESeq2") count_mat <- counts(T47D) meta <- data.frame(colData(T47D)) dds <- make_dds(counts = count_mat, metadata = meta, ah_record = "AH89426")
Create a mean-sd plot Make a scatterplot that shows for each gene its standard deviation versus mean.
mean_sd_plot(vsd)
mean_sd_plot(vsd)
vsd |
A DESeqTransform object |
A ggplot object of the ggplot2 package that contains the mean-sd plot.
library("DESeq2") dds <- makeExampleDESeqDataSet(interceptMean=10, n=5000) vsd <- vst(dds) mean_sd_plot(vsd)
library("DESeq2") dds <- makeExampleDESeqDataSet(interceptMean=10, n=5000) vsd <- vst(dds) mean_sd_plot(vsd)
Plot the total number of counts for each sample and the major classes of ENSEMBL gene biotypes (protein coding, lncRNA, etc.)
plot_biotypes(dds)
plot_biotypes(dds)
dds |
A DESeqDataSet |
A ggplot object of the ggplot2 package.
plot_biotypes(T47D)
plot_biotypes(T47D)
Plot gene expression along a chromosome
plot_chromosome(vsd, chr, scale = FALSE, trunc_val = NULL)
plot_chromosome(vsd, chr, scale = FALSE, trunc_val = NULL)
vsd |
An object generated by |
chr |
A string denoting a chromosome as annotated by ENSEMBL, e.g. '1', '2', 'X', 'Y', 'MT' |
scale |
Whether to scale the columns of the heatmap |
trunc_val |
Truncate the expression matrix to this value prior to plotting. This is useful if some very high expression values dominate the heatmap. By default, the heatmap is truncated to expression values at most 3 standard deviations from the mean. |
A Heatmap-class object of the ComplexHeatmap
package that contains the heatmap of expression values.
library("DESeq2") chr1 <- T47D[which(mcols(T47D)$chromosome=="1"),] vsd <- vst(chr1) plot_chromosome(vsd, chr="1")
library("DESeq2") chr1 <- T47D[which(mcols(T47D)$chromosome=="1"),] vsd <- vst(chr1) plot_chromosome(vsd, chr="1")
Plot a gene
plot_gene( gene, dds, x_var = NULL, color_by = NULL, point_alpha = 0.7, point_rel_size = 2, show_plot = TRUE )
plot_gene( gene, dds, x_var = NULL, color_by = NULL, point_alpha = 0.7, point_rel_size = 2, show_plot = TRUE )
gene |
A gene ID or gene name, i.e. an element of |
dds |
a DESeqDataSet |
x_var |
Variable to plot on the x-axis. If NULL, then each sample is plotted separately. |
color_by |
Variable (column in |
point_alpha |
alpha value of |
point_rel_size |
relative size of |
show_plot |
Whether to show the plot or not |
The function displays the plot and returns invisible the data frame of expression values and colData annotation for the gene.
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet() colData(dds)$type <- c("A","A","A","B","B","B") colData(dds)$patient <- c("1","1","2","2","3","3") dds <- estimateSizeFactors(dds) plot_gene("gene1", dds) plot_gene("gene1", dds, x_var="patient", color_by="type")
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet() colData(dds)$type <- c("A","A","A","B","B","B") colData(dds)$patient <- c("1","1","2","2","3","3") dds <- estimateSizeFactors(dds) plot_gene("gene1", dds) plot_gene("gene1", dds, x_var="patient", color_by="type")
For specified thresholds, the number of detected genes is shown for each sample.
plot_gene_detection(dds, thresholds = c(3, 10, 20, 50))
plot_gene_detection(dds, thresholds = c(3, 10, 20, 50))
dds |
A DESeqDataSet |
thresholds |
Vector of thresholds for which the number of genes with counts greater or equal than the thresholds is plotted |
A ggplot object of the ggplot2 package that contains the gene detection plot.
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet() plot_gene_detection(dds)
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet() plot_gene_detection(dds)
Plot per sample the fraction of genes, versus the fraction of total counts.
plot_library_complexity(dds, show_progress = TRUE)
plot_library_complexity(dds, show_progress = TRUE)
dds |
A DESeqDataSet |
show_progress |
Whether to show a progress bar of the computation. |
A ggplot object of the ggplot2 package that contains the library complexity plot.
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet() plot_library_complexity(dds)
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet() plot_library_complexity(dds)
Plot loadings of a principal component
plot_loadings( pca_res, PC = 1, square = FALSE, color_by = NULL, annotate_top_n = 0, highlight_genes = NULL, show_plot = TRUE )
plot_loadings( pca_res, PC = 1, square = FALSE, color_by = NULL, annotate_top_n = 0, highlight_genes = NULL, show_plot = TRUE )
pca_res |
A result returned from |
PC |
Number of the principal component to plot |
square |
Whether to plot squared loadings. The squared loading is equal to the fraction of variance explained by the respective feature in the given principal component. |
color_by |
Variable (column in |
annotate_top_n |
Annotate the top n features with positive or negative loading |
highlight_genes |
Vector of gene names or gene IDs to highlight on the plot (overwrites top_n annotation) |
show_plot |
Whether to show the plot |
The function displays the loadings plot and returns invisible a list of the plot, the data.frame of the PCA loadings.
set.seed(1) data <- matrix(rnorm(100*6), ncol=6) data <- t(t(data)+c(-1, -1.1, -1.2, 1, 1.1, 1.2)) pca_res <- plot_pca(data) plot_loadings(pca_res)
set.seed(1) data <- matrix(rnorm(100*6), ncol=6) data <- t(t(data)+c(-1, -1.1, -1.2, 1, 1.1, 1.2)) pca_res <- plot_pca(data) plot_loadings(pca_res)
MA-plot of a differential testing result
plot_ma(de_res, dds, annotate_top_n = 5, highlight_genes = NULL)
plot_ma(de_res, dds, annotate_top_n = 5, highlight_genes = NULL)
de_res |
An object returned by |
dds |
The DESeqDataSet that was used to build the ´de_res´ object. This is needed for gene name annotation. |
annotate_top_n |
Annotate the top n significant genes by fold change (up- and down-regulated) |
highlight_genes |
Vector of gene names or gene IDs to highlight on the plot (overwrites top_n annotation) |
A ggplot object of the ggplot2 package that contains the MA-plot. The plot shows three classes of points: Light gray points are genes with low counts that are removed from the analysis by independent filtering. Darker gray points are not significant genes that show a density map to visualize where the majority of non-significant points are located. Finally, red point show significant genes.
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet(n=1500, m=6, betaSD=.3, interceptMean=6) rowData(dds)$gene_name <- rownames(dds) dds <- DESeq(dds) de_res <- results(dds) plot_ma(de_res, dds)
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet(n=1500, m=6, betaSD=.3, interceptMean=6) rowData(dds)$gene_name <- rownames(dds) dds <- DESeq(dds) de_res <- results(dds) plot_ma(de_res, dds)
Plot results of a principal component analysis
plot_pca( obj, PC_x = 1, PC_y = 2, n_feats = 500, scale_feats = FALSE, na_frac = 0.3, metadata = NULL, color_by = NULL, shape_by = NULL, point_alpha = 0.7, point_rel_size = 2, show_plot = TRUE, rasterise = FALSE, ... )
plot_pca( obj, PC_x = 1, PC_y = 2, n_feats = 500, scale_feats = FALSE, na_frac = 0.3, metadata = NULL, color_by = NULL, shape_by = NULL, point_alpha = 0.7, point_rel_size = 2, show_plot = TRUE, rasterise = FALSE, ... )
obj |
A (features x samples) matrix or SummarizedExperiment object |
PC_x |
The PC to show on the x-axis. |
PC_y |
The PC to show on the y-axis. |
n_feats |
Number of top-variable features to include. |
scale_feats |
Whether to scale the features. |
na_frac |
Only consider features with the stated maximum fraction of NAs or NaNs. NA/NaNs will be mean-imputed for PCA. |
metadata |
A data.frame used for annotating samples. |
color_by |
Variable by which to color points. Must be a column in metadata or in |
shape_by |
Variable by which to color points. Must be a column in metadata or in |
point_alpha |
alpha value of |
point_rel_size |
relative size of |
show_plot |
Whether to show the plot or not |
rasterise |
Whether to rasterise the point, using ggrastr. |
... |
Other parameters passed on to ggrastr::rasterise |
If the metadata
or colData
of obj
contain a column colname
, this colum will be removed in the $pca_data
slot,
because this column contains the colnames of the data matrix. Similarly, for the $loadings
slot, the column rowname
is
reserved for the rownames of the data matrix.
The function displays the plot and returns invisible a list of the plot, the data.frame to make the plot, the vector of percentages of variance explained and the loadings matrix.
set.seed(1) data <- matrix(rnorm(100*6), ncol=6) data <- t(t(data)+c(-1, -1.1, -1.2, 1, 1.1, 1.2)) plot_pca(data)
set.seed(1) data <- matrix(rnorm(100*6), ncol=6) data <- t(t(data)+c(-1, -1.1, -1.2, 1, 1.1, 1.2)) plot_pca(data)
Plot matrix of PCA scatter plots
plot_pca_scatters( obj, n_PCs = min(10, nrow(obj), ncol(obj)), show_var_exp = T, n_feats = 500, scale_feats = FALSE, na_frac = 0.3, metadata = NULL, color_by = NULL, shape_by = NULL, point_alpha = 0.7, point_rel_size = 2, transpose = FALSE, rasterise = FALSE, ... )
plot_pca_scatters( obj, n_PCs = min(10, nrow(obj), ncol(obj)), show_var_exp = T, n_feats = 500, scale_feats = FALSE, na_frac = 0.3, metadata = NULL, color_by = NULL, shape_by = NULL, point_alpha = 0.7, point_rel_size = 2, transpose = FALSE, rasterise = FALSE, ... )
obj |
A (features x samples) matrix or SummarizedExperiment object |
n_PCs |
Number of principal components to plot |
show_var_exp |
Whether to show a plot of the percentage of variance explained by each PC in the bottom left corner. |
n_feats |
Number of top-variable features to include. |
scale_feats |
Whether to scale the features. |
na_frac |
Only consider features with the stated maximum fraction of NAs or NaNs. NA/NaNs will be mean-imputed for PCA. |
metadata |
A data.frame used for annotating samples. |
color_by |
Variable by which to color points. Must be a column in metadata or in |
shape_by |
Variable by which to color points. Must be a column in metadata or in |
point_alpha |
alpha value of |
point_rel_size |
relative size of |
transpose |
Wheter to transpose the whole matrix of scatter plots |
rasterise |
Whether to rasterise the points using ggrastr. |
... |
Other parameters passed on to ggrastr::rasterise |
The function displays the scatter plots of the PCs
set.seed(1) data <- matrix(rnorm(100*6), ncol=6) data <- t(t(data)+c(-1, -1.1, -1.2, 1, 1.1, 1.2)) plot_pca_scatters(data)
set.seed(1) data <- matrix(rnorm(100*6), ncol=6) data <- t(t(data)+c(-1, -1.1, -1.2, 1, 1.1, 1.2)) plot_pca_scatters(data)
Plot clustering of samples in a distance heatmap
plot_sample_clustering( se, n_feats = 500, anno_vars = NULL, anno_title = "group", distance = "euclidean", ... )
plot_sample_clustering( se, n_feats = 500, anno_vars = NULL, anno_title = "group", distance = "euclidean", ... )
se |
A SummarizedExperiment object. |
n_feats |
Number of top-variable features (genes) to consider |
anno_vars |
Character vector of columns in |
anno_title |
The title of the color legend for |
distance |
The type of distance metric to consider. Either 'euclidean', 'pearson' or 'spearman' |
... |
Other arguments passed on to ComplexHeatmap::Heatmap() |
A Heatmap-class object of the ComplexHeatmap
package that contains the heatmap of pairwise sample distances.
library("DESeq2") dds <- makeExampleDESeqDataSet(m=8, interceptMean=10) vsd <- vst(dds) plot_sample_clustering(vsd)
library("DESeq2") dds <- makeExampleDESeqDataSet(m=8, interceptMean=10) vsd <- vst(dds) plot_sample_clustering(vsd)
For each level of the grouping variable, the gene-wise median over all samples is computed to obtain a reference sample. Then, each sample is plotted against the reference.
plot_sample_MAs(vsd, group, y_lim = 3, rasterise = FALSE, ...)
plot_sample_MAs(vsd, group, y_lim = 3, rasterise = FALSE, ...)
vsd |
An object generated by |
group |
A grouping variable, must be a column of |
y_lim |
Y-axis limits, the axis will run from |
rasterise |
Whether to rasterise the points using ggrastr. |
... |
Other parameters passed on to ggrastr::rasterise |
A list of ggplot objects of the ggplot2 package, with each element corresponding to one MA-plot.
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet(n=1000, m=4, interceptMean=10) colData(dds)$type <- c("A","A","B","B") vsd <- vst(dds) plot_sample_MAs(vsd, group="type")
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet(n=1000, m=4, interceptMean=10) colData(dds)$type <- c("A","A","B","B") vsd <- vst(dds) plot_sample_MAs(vsd, group="type")
Plot the distribution of the total number of counts per sample as histogram.
plot_total_counts(dds, n_bins = 50)
plot_total_counts(dds, n_bins = 50)
dds |
A DESeqDataSet |
n_bins |
Number of histogram bins |
A ggplot object of the ggplot2 package that contains the histogram of total counts per sample.
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet(m=30) plot_total_counts(dds)
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet(m=30) plot_total_counts(dds)
For the given level, the gene-wise median over all samples is computed to obtain a reference sample. Then, each sample is plotted against the reference as MA-plot.
plot_within_level_sample_MAs( vsd, group, level, y_lim = 4, rasterise = FALSE, ... )
plot_within_level_sample_MAs( vsd, group, level, y_lim = 4, rasterise = FALSE, ... )
vsd |
An object generated by |
group |
A grouping variable, must be a column of |
level |
A level of the grouping variable |
y_lim |
Y-axis limits, the axis will run from |
rasterise |
Whether to rasterise the points using ggrastr. |
... |
Other parameters passed on to ggrastr::rasterise |
A list of ggplot objects of the ggplot2 package that contains for each sample of the specified level the the sample vs reference MA-plot.
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet(n=1000, m=4, interceptMean=10) colData(dds)$type <- c("A","A","B","B") vsd <- vst(dds) plot_within_level_sample_MAs(vsd, group="type", level="A")
library("DESeq2") set.seed(1) dds <- makeExampleDESeqDataSet(n=1000, m=4, interceptMean=10) colData(dds)$type <- c("A","A","B","B") vsd <- vst(dds) plot_within_level_sample_MAs(vsd, group="type", level="A")
This function takes a list of plots as input and makes a pdf with ncol
x nrow
plots per page.
save_plots_to_pdf( plots, file = "plots.pdf", ncol, nrow, subfig_width = subfig_height * 16/9, subfig_height = 2.5, legend_position = "original" )
save_plots_to_pdf( plots, file = "plots.pdf", ncol, nrow, subfig_width = subfig_height * 16/9, subfig_height = 2.5, legend_position = "original" )
plots |
List of plots that is passed to the |
file |
file where the plots are saved |
ncol |
number of columns per page for the grid of plots |
nrow |
number of rows per page for the grid of plots |
subfig_width |
width of a plot of the grid in inches |
subfig_height |
height of a plot of the grid in inches |
legend_position |
either 'original' if the original legend of each sub-plot is shown, 'none', if no legend should be shown in any of the sub-plots, 'bottom', if no legend should be shown in the sub plots and one shared legend at the bottom or 'right', which is same as 'bottom', but shown on the right |
The function returns nothing but is called for it's side effect, which is to save a pdf of plots to the filesystem.
library("ggplot2") manuf <- unique(mpg$manufacturer) plots <- lapply(manuf, function(x){ df <- mpg[mpg$manufacturer==x,] ggplot(df, aes(cty, hwy)) + geom_point() + labs(title=x) }) save_plots_to_pdf(plots, ncol=3, nrow=2)
library("ggplot2") manuf <- unique(mpg$manufacturer) plots <- lapply(manuf, function(x){ df <- mpg[mpg$manufacturer==x,] ggplot(df, aes(cty, hwy)) + geom_point() + labs(title=x) }) save_plots_to_pdf(plots, ncol=3, nrow=2)
The dataset contains the read counts of experiment GSE89888 in which T47D cells with different mutation statuses were treated with E2 (estradiol) or vehicle.
T47D
T47D
A DESeqDataSet with 43576 rows (of genes) and 24 columns (of samples).
Differential expression results corresponding to the T47D data set.
T47D_diff_testing
T47D_diff_testing
A DESeqResults object with 36562 rows and 3 columns.
See the 'data' vignette on how to reproduce this object.