Commit 6cba9fe1 authored by Mattia Bosio's avatar Mattia Bosio
Browse files

initial commit

parents
Developed and tested with R 3.5.1
dependencies:
library(IRanges)
library(optparse)
library(grid)
library(ggplot2)
library(gridExtra)
library(GenomicRanges)
library(ggbio)
library(reshape2)
Usage: Rscript plot_variants.R --cases [$CASES] --controls [$CONTROLS]
--vcf_cases [$VCFCASES] --vcf_controls [$VCF_CONTROLS]
--out [$OUTPREFIX]
Parameters :
CASES: tsv file with the VEP annotated causal mutations.
CONTROLS: tsv file with VEP annotated non-causal mutations
VCFCASES : Optional VCF file for the causal mutations
VCFCONTROLS : Optional VCF file for the non-causal mutations
OUT : Output prefix for the generated plots
Note: the domain plot annotation have hardcoded values for ENST00000453960
if you wish to use it for other transcripts or genes, these values need to be updated accordingly
#attempt plot info
library(IRanges)
library(optparse)
library(grid)
library(ggplot2)
library(gridExtra)
library(GenomicRanges)
library(ggbio)
library(reshape2)
#### input arguments
option_list = list(
make_option(c("-c", "--cases"), type="character", default=NULL,
help="cases dataset", metavar="character"),
make_option(c("-x", "--controls"), type="character", default=NULL,
help="control datasets, can be empty", metavar="character"),
make_option(c( "--vcf_cases"), type="character", default=NULL,
help="cases dataset VCF file", metavar="character"),
make_option(c( "--vcf_controls"), type="character", default=NULL,
help="control datasets, can be empty", metavar="character"),
make_option(c("-o", "--out"), type="character", default="out.txt",
help="output prefix [default= %default]", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
########
########
cases = read.table(opt$cases,header =T, stringsAsFactors = F )
cases$Group = 'RTT Causing'
vcf_cases = read.table(opt$vcf_cases,header =T,comment.char = "" )
vcf_cases$Group = 'Pathogenic'
vcf_cases$width = abs(vcf_cases$End- vcf_cases$Start ) +1
vcf_cases = vcf_cases[ order(-vcf_cases$width, vcf_cases$Start), ]
if (!is.null(opt$controls)){
controls = read.table(opt$controls,head=T, stringsAsFactors = F )
controls$Group = 'Benign'
all_p = rbind(cases,controls)
}else{
controls = NULL
all_p = cases}
if (!is.null(opt$vcf_controls)){
vcf_controls = read.table(opt$vcf_controls,head=T,comment.char = "" )
vcf_controls$Group = 'Benign'
vcf_controls$width = abs(vcf_controls$End- vcf_controls$Start ) +1
all_g = rbind(vcf_cases,vcf_controls)
}else{
vcf_controls = NULL
all_g = vcf_cases}
####################################################################
###### dna position plot
####################################################################
gr = GRanges(seqnames ="chrX", IRanges(start=all_g$Start,end=all_g$End),
strand= "+",Group=all_g$Group)
#autoplot(gr, aes(color = Group, fill = Group), facets = Group ~ seqnames)
pdf(file=paste(opt$out,"_dna_plot", '.pdf', sep=""), width = 10, height = 10)
ggplot(gr,aes(color = Group, fill = Group)) +
stat_coverage(gr,main='s',
facets = Group ~ seqnames,
geom = 'area',
scales = "free")
dev.off()
####################################################################3
###### protein position plots
####################################################################3
all_p = all_p[all_p$Feature=='ENST00000453960',]
#### Domains and conserved regions definition
domains = data.frame(Domain = c('MBD','AT','TRDa','AT2','TRDb','His_rich','Pro_rich'),
Start = c(90,185,207,265,278,366,376),End = c(162,197,264,277,310,372,405))
conserved_regions = data.frame(Region=c('motif1','motif2','AT_hook1','motif3','motif4','AT_hook_2',
'motif5','motif5','hist_rich','motif7','motif8'),
Start = c(29,167,185,204,249,265,283,300,366,404,464),
End = c(37,180,194,226,256,272,294,312,372,409,489)
)
domains$Cases = apply(domains, 1, function(x) nrow(cases[(cases$Protein_start <= x[3]) & (cases$Protein_end >= x[2]),]) )
domains$Cases_len= domains$Cases/(domains$End- domains$Start)
conserved_regions$Cases = apply(conserved_regions, 1, function(x) nrow(cases[(cases$Protein_start <= x[3]) & (cases$Protein_end >= x[2]),]) )
conserved_regions$Cases_len= conserved_regions$Cases/(conserved_regions$End- conserved_regions$Start)
if (!is.null(controls)){
#selct cases common to both sets
both = cases[cases$ID %in% controls$ID,]
both$Group='Both'
#remove them from set
all_p = all_p[!(all_p$ID %in% both$ID),]
#add them again as both
all_p = rbind(all_p,both)
all_p$Group = as.factor(all_p$Group)
all_p$Group=factor(all_p$Group,levels(all_p$Group)[c(1,3,2)])
#all_p$Group=factor(all_p$Group,levels(all_p$Group)[c(2,1,3)])
#select only relevant columns from here on
# so we can use complete-cases to subselect rows
fields = c("ID","Group","Feature","Protein_start","Protein_end")
bkp =all_p
all_p = all_p[,fields]
##domains part[1st one only the count, 2nd has a division by length of region]
domains$Controls = apply(domains, 1, function(x) nrow(controls[(controls$Protein_start <= x[3]) & (controls$Protein_end >= x[2]),]) )
domains$Both = apply(domains, 1, function(x) nrow(both[(both$Protein_start <= x[3]) & (both$Protein_end >= x[2]),]) )
domains$Controls_len = domains$Controls/(domains$End- domains$Start)
domains$Both_len = domains$Both/(domains$End- domains$Start)
dfm = melt(domains[,c('Domain','Cases','Both','Controls')],id.vars = 1)
dfm2 = melt(domains[,c('Domain','Cases_len','Both_len','Controls_len')],id.vars = 1)
## conserved regions part [1st one only the count, 2nd has a division by length of region]
conserved_regions$Controls = apply(conserved_regions, 1, function(x) nrow(controls[(controls$Protein_start <= x[3]) & (controls$Protein_end >= x[2]),]) )
conserved_regions$Both = apply(conserved_regions, 1, function(x) nrow(both[(both$Protein_start <= x[3]) & (both$Protein_end >= x[2]),]) )
conserved_regions$Controls_len = conserved_regions$Controls/(conserved_regions$End- conserved_regions$Start)
conserved_regions$Both_len = conserved_regions$Both/(conserved_regions$End- conserved_regions$Start)
dfm_r = melt(conserved_regions[,c('Region','Cases','Both','Controls')],id.vars = 1)
dfm2_r = melt(conserved_regions[,c('Region','Cases_len','Both_len','Controls_len')],id.vars = 1)
}else{
dfm = melt(domains[,c('Domain','Cases')],id.vars = 1)
dfm2 = melt(domains[,c('Domain','Cases_len')],id.vars = 1)
dfm_r = melt(conserved_regions[,c('Region','Cases')],id.vars = 1)
dfm2_r = melt(conserved_regions[,c('Region','Cases_len')],id.vars = 1)
}
all_p = all_p[complete.cases(all_p),]
####################################################################3
##### Coverage plot of protein-affecting variants
####################################################################3
gr = GRanges(seqnames =all_p$Feature, IRanges(start=all_p$Protein_start,end=all_p$Protein_end),
strand= "+",Group=all_p$Group)
pdf(file=paste(opt$out,"_protein_plot", '.pdf', sep=""), width = 10, height = 10)
gg = ggplot(gr,aes(color = Group, fill = Group)) +
stat_coverage(gr,main='Mutation Coverage',
facets = Group ~ seqnames,
geom = 'area',
scales = "free")
gg +theme(text = element_text(size=20)) + scale_fill_manual(values=c ("#ff8c1a","#56B4E9","#CCCCCC") )
dev.off()
####################################################################3
## domains
####################################################################3
dfm$Domain = factor(dfm$Domain, levels = dfm$Domain)
p = ggplot(dfm, aes(x=factor(Domain), y=value, fill=factor(variable))) +
geom_bar(stat="identity", position="dodge") +
theme(legend.position="bottom") + scale_fill_manual(values=c ("#ff8c1a","#56B4E9","#CCCCCC") )
dfm2$Domain = factor(dfm2$Domain, levels = dfm2$Domain)
q = ggplot(dfm2, aes(x=factor(Domain), y=value, fill=factor(variable))) +
geom_bar(stat="identity", position="dodge") +
theme(legend.position="bottom") + scale_fill_manual(values=c ("#ff8c1a","#56B4E9","#CCCCCC") )
pdf(file=paste(opt$out,"_domains_plot", '.pdf', sep=""), width = 10, height = 10)
grid.arrange(p,q,ncol=1)#
dev.off()
####################################################################
# CONSERVED REGIONS PLOT
####################################################################
dfm_r$Region = factor(dfm_r$Region, levels = dfm_r$Region)
p = ggplot(dfm_r, aes(x=factor(Region), y=value, fill=factor(variable))) +
geom_bar(stat="identity", position="dodge") +
theme(legend.position="bottom") + scale_fill_manual(values=c ("#ff8c1a","#56B4E9","#CCCCCC") )
dfm2_r$Region = factor(dfm2_r$Region, levels = dfm2_r$Region)
q = ggplot(dfm2_r, aes(x=factor(Region), y=value, fill=factor(variable))) +
geom_bar(stat="identity", position="dodge") +
theme(legend.position="bottom")+ scale_fill_manual(values=c ("#ff8c1a","#56B4E9","#CCCCCC") )
pdf(file=paste(opt$out,"_conserved_regions_plot", '.pdf', sep=""), width = 10, height = 10)
grid.arrange(p,q,ncol=1)#
dev.off()
####################################################################
###### Boxplots of variants : MetaLR , CADD, fathmm ,
####################################################################
cols = c('PolyPhen', 'SIFT','CADD_phred','MetaLR_score','fathmm.MKL_coding_score','gnomAD_AF')
all_p=bkp
mm = melt(all_p, id=c('ID','Group'),measure=cols)
levels(mm$variable) = c('PolyPhen','SIFT','CADD','MetaLR','FATHMM.MKL','GnomAD AF')
pdf(file=paste(opt$out,"_boxplots", '.pdf', sep=""), width = 10, height = 7)
#
# ggplot(mm)+geom_boxplot(aes(x=Group, y=value,fill=Group))+
# stat_compare_means(method='wilcox.test') +
# facet_wrap(~variable, scales="free_y") + theme_bw() +
# scale_fill_manual(values=c("#ff8c1a","#56B4E9","#CCCCCC") ) + theme(legend.position = 'bottom')
my_comparisons <- list( c("Benign", "Shared"), c("Benign", "RTT Causing"), c("Shared", "RTT Causing") )
ggboxplot(mm,x='Group',y='value',fill='Group') + stat_compare_means(comparisons = my_comparisons,label = "p.signif") +
facet_wrap(~variable,scales ='free_y')+ theme_bw() +
scale_fill_manual(values=c("#ff8c1a","#56B4E9","#CCCCCC") ) + theme(legend.position = 'bottom')
dev.off()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment