Commit a586140b authored by scapella's avatar scapella

Created repository, added README and subfolders from two other repositories

parents
set -e
## Try to set some flags
# -i : input file
# -o : output folder
# -p : output prefix
# -c : code folder with the script. Default './'
# -v : execute VEP locally or not
# -g : fasta reference file
# -t : transcript file :see example/chrx.txt
alias python='/home/mbosio/anaconda2/bin/python'
CODE='/home/mbosio/projects/rtt/code/'
CODE='./code/'
VEP='FALSE'
dbNSFP='/data/resources/vep/dbNSFP.gz'
dbscSNV='/data/resources/vep/dbscSNV1.1_GRCh38.txt.gz'
while getopts 'i:o:p:c:g:t:r:v' flag; do
case "${flag}" in
i) INFILE="${OPTARG}" ;;
o) OUTFOLDER="${OPTARG}" ;;
p) PREFIX="${OPTARG}" ;;
c) CODE="${OPTARG}" ;;
g) REF="${OPTARG}" ;;
t) TRANSCRIPT="${OPTARG}" ;;
v) VEP="TRUE" ;;
r) REGION="${OPTARG}" ;;
*) "Unexpected option ${flag}" ;;
esac
done
echo "############################################"
echo "#Step 1 : parse the data from excel to HGVS like "
echo "############################################"
python $CODE/parse_vars.py \
--infile $INFILE \
--outfile $OUTFOLDER/$PREFIX"_HGVS_parsed.txt" \
--region $REGION
#MECP2: NC_000023.11:g.,NM_004992.3:c.
echo "Done"
echo " "
echo "############################################"
echo "#Step 2 : extract only the parsed column for VCF parsing"
echo "############################################"
awk -F '\t' '{print $4}' < $OUTFOLDER/$PREFIX"_HGVS_parsed.txt" > $OUTFOLDER/$PREFIX"_HGVS_input.txt"
echo "Done"
echo " "
echo "############################################"
echo "#Step3 : parse to VCF"
echo "############################################"
python $CODE/HGVS_parse.py \
--infile $OUTFOLDER/$PREFIX"_HGVS_input.txt" \
--outfile $OUTFOLDER/$PREFIX".vcf" \
--ref $REF \
--transcript $TRANSCRIPT &> $OUTFOLDER/$PREFIX"_step3.log"
{ grep -P 'Error' $OUTFOLDER/$PREFIX".vcf" > $OUTFOLDER/$PREFIX".error.vcf" || test $? = 1; }
grep -v 'Error' $OUTFOLDER/$PREFIX".vcf" > $OUTFOLDER/$PREFIX".ok.vcf"
echo "Done"
echo " "
echo ">>>>>>>>> VCF to annotate : $OUTFOLDER/$PREFIX".ok.vcf" <<<<<<<<"
echo " "
echo $VEP
if [ "$VEP" = 'TRUE' ]; then
echo "############################################"
echo "#Step 4 : Annotate with VEP"
echo "############################################"
~/software/ensembl-vep/vep --cache \
-i $OUTFOLDER/$PREFIX".ok.vcf" \
-o $OUTFOLDER/$PREFIX"_annotated.txt" \
--everything \
--format vcf \
--flag_pick \
--force_overwrite \
--plugin dbNSFP,$dbNSFP,MetaLR_score,CADD_phred,fathmm-MKL_coding_score \
--plugin dbscSNV,$dbscSNV
echo "############################################"
echo "#Step 5 extract protein position for the variants"
echo "############################################"
python $CODE/parse_vep_for_protein_place.py $OUTFOLDER/$PREFIX"_annotated.txt" > $OUTFOLDER/$PREFIX"_R_full_input.tsv"
# grep -e 'Uploaded_variation' -e 'PICK' $OUTFOLDER/$PREFIX"_R_full_input.tsv" > $OUTFOLDER/$PREFIX"_R_picked_input.tsv"
head -n1 $OUTFOLDER/$PREFIX"_R_full_input.tsv" > $OUTFOLDER/$PREFIX"_R_picked_input.tsv"
awk '$20 == "1" { print $0 }' $OUTFOLDER/$PREFIX"_R_full_input.tsv" >> $OUTFOLDER/$PREFIX"_R_picked_input.tsv"
else
echo "############################################"
echo 'Step4-5 not executed. Add -v flag to run it'
echo "It requires to specify dbNSFP and dbscSNV plugins"
echo "############################################"
fi
# HGVSparse
Simple utility to parse HGVS to VCF (and optionally annotate them with local installation of `[Ensembl-vep](https://github.com/Ensembl/ensembl-vep)`)
### Usage example
For a quick example run the provided example.sh.
This tool requires to have a fasta reference file (chrX provided) and an annotated transcript file
The example focuses on MECP2 gene, here the GTF transcript file example, required to infer the mutation type:
0 NC_000023.11 X - 1 156040895 1 156040895 10 67092175,67096251,67103237,67111576,67113613,67115351,67125751,67127165,67131141,67134929, 6 7093604,67096321,67103382,67111644,67113756,67115464,67125909,67127257,67131227,67134971, 0 C1orf141 none none -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
The example also requires to specify region names to keep for parsing. In this case, we chose to process HGVS entries from `NC_000023.11:g.,NM_004992.3:c.` **[-r line in example.sh ]**
## example.sh ##
# 1st: unzip the fasta file
gzip -d example/chrX.fa.gz
# 2nd: make the output directory
mkdir -p example/results
# 3rd: run the conversion tool
sh HGVS_to_VEP.sh \
-i example/variants_in.txt \
-o example/results \
-p test \
-g example/chrX.fa \
-t example/chrx.txt \
-r NC_000023.11:g.,NM_004992.3:c.
#### Run with VEP local annotaion
This requires a local installation of Ensembl-vep with the appropriate cache and fasta references.
It also requires to have dbNSFP and dbscSNV plugins installed.
To add the VEP annotation, add a `-v` flag to the execution line of `HGVS_to_VEP.sh` call
import pyhgvs as hgvs
import pyhgvs.utils as hgvs_utils
from pygr.seqdb import SequenceFileDB
import sys
import argparse
parser = argparse.ArgumentParser(description = 'Parse HGVS to VCF')
parser.add_argument('--infile', type=str, dest='infile', required=True, help='Input file [required]')
parser.add_argument('--outfile', type=str, dest='outfile', required=True, help='Output [required]')
parser.add_argument('--ref', type=str, dest='ref', required=True, help='FASTA file with reference [required]')
parser.add_argument('--transcript', type=str, dest='transcript', required=True, help='transcript file [required]')
args = parser.parse_args()
# Read genome sequence using pygr.
#genome = SequenceFileDB('/data/resources/fasta/grch38/GRCh38.primary_assembly.genome.fa')
genome = SequenceFileDB(args.ref)
# Read RefSeq transcripts into a python dict.
#with open('/home/mbosio/projects/rtt/code/chrx.txt') as infile:
with open(args.transcript) as infile:
transcripts = hgvs.utils.read_transcripts(infile)
# Provide a callback for fetching a transcript by its name.
def get_transcript(name):
return transcripts.get(name)
with open(args.infile) as rd,open(args.outfile,'w') as wr:
wr.write('\t'.join(['#Chr','Start','End','Ref','Alt','Group']) + '\n')
for line in rd:
#print line.strip()
try:
a = hgvs.HGVSName(line.strip())
a.chrom='X'
outlist = hgvs.get_vcf_allele(a,genome)
outlist = [str(x) for x in outlist]
outlist.append('Cases')
wr.write('\t'.join(outlist)+'\n')
except :
wr.write('Error:%s %s \n'%(line.strip(),sys.exc_info()[0]))
print line.strip()
print sys.exc_info()[0]
pass
import sys
import itertools
with open(sys.argv[1]) as rd:
for line in rd:
print line
a = ["".join(x) for _, x in itertools.groupby(line.strip(), key=str.isdigit)]
print a
D C . . .
3 319780 . GA G . . .
19 110747 . G GT . . .
1 160283 sv1 . <DUP> . . SVTYPE=DUP;END=471362 .
1 1385015 sv2 . <DEL> . . SVTYPE=DEL;END=1387562 .
import sys
import re
import argparse
ID = 0
def RepresentsInt(s):
try:
int(s)
return True
except ValueError:
return False
def two_to_parse(Parts,offset= 0):
# check for parentheses
# if there's a parentesis adapt the logic to the part
# IF '?' or '*' is there :
# ?_*1000 --> will be 1000 - offset
# 1000? --> will be 1000 + offset
# Else : keep the number
# btw characters like '+' and '_' are removed from the scope
vals = ['?','=']
if '(' in Parts[0]:
begin = Parts[0].split('(')[1].split(')')[0]
else:
begin = Parts[0]
if '(' in Parts[1]:
ending = Parts[1].split('(')[1].split(')')[0]
else:
ending = Parts[1]
if any([x in vals for x in begin]):
# there are '?' or * in the part to parse
begin=begin.replace('?','').replace('*','').replace('+','')
tmp = begin.split('_')
# Keep the max value in case of (a_B)_( )dup
if RepresentsInt(tmp[1]):
begin = tmp[1]
else :
begin = str(int(begin.split('_')[0]) + offset)
elif '_' in begin :
begin = str(int(begin.split('_')[1]) + offset)
# # Keep the min value for beginning
# if RepresentsInt(tmp[0]):
# begin = tmp[0]
# else :
# begin = str(int(begin.split('_')[1]) - offset)
# elif '_' in begin :
# begin = str(int(begin.split('_')[0]) - offset)
if any([x in vals for x in ending]):
# there are '?' or * in the part to parse
ending=ending.replace('?','').replace('*','').replace('+','')
tmp = ending.split('_')
# Keep the min value for ending
if RepresentsInt(tmp[0]):
ending = tmp[0]
else:
ending = str(int(ending.split('_')[1]) - offset)
elif '_' in ending :
ending = str(int(ending.split('_')[0]) ) #take the min value when xxx_(a_b)dup
# #keep max value for ending
# if RepresentsInt(tmp[1]):
# ending = tmp[1]
# else:
# ending = str(int(ending.split('_')[0]) + offset)
# elif '_' in ending :
# ending = str(int(ending.split('_')[1]) ) #take the min value when xxx_(a_b)dup
return '_'.join([begin,ending])
def one_to_parse(variant,offset=10):
vals = ['?','=']
variant= variant.split('(')[1].split(')')[0]
ending = variant.split('_')[1]
if any([x in vals for x in variant]):
# there are '?' or * in the part to parse
#this just stays tge same because it's something like (?_100del)
value = str(int(ending) - offset)
return '_'.join([value,ending])
print "This tool is required to put variants directly in format HGVS without uncertainty to be used later with hgvs parser.py and produce a VCF"
print "Status\tID\toriginal variant\tParsed variant\tVCF variant"
parser = argparse.ArgumentParser(description = 'Parse HGVS to VCF')
parser.add_argument('--infile', type=str, dest='infile', required=True, help='Input file [required]')
parser.add_argument('--outfile', type=str, dest='outfile', required=True, help='Output [required]')
parser.add_argument('--region', type=str, dest='reg', required=True, help='comma separated list of transcript IDs or Gene IDs to keep [required]')
args = parser.parse_args()
myregion = args.reg.split(',')
#defregion = "NC_000023.11:g.,NM_004992.3:c."
with open(args.infile) as rd,open(args.outfile,'w') as wr:
for line in rd:
ID +=1
# For us it's not interesting at the moment the allel specific annotation
# So we will remove the '[]'
# and also substitute (;) with ;
#print ID
#Step 1 Isolate the genomic regio
region = [x for x in myregion if x in line]
if len(region) ==1:
region = region[0]
else:
print 'Error'
if len(region) > 1:
print 'Multiple regions from %s appear '%(args.reg)
print line
raise
else:
print 'No regions from %s appear '%(args.reg)
print line
raise
# region='NC_000023.11:g.'
# if region in line:
# pass
# elif 'NM_004992.3:c.' in line:
# region ='NM_004992.3:c.'
# else:
# print 'adjust the script because only supports %s and %s as regions '%(region, 'NM_004992.3:c.')
# print 'exiting'
# print line
# raise
variants = line.strip().split(region)[1].replace('[','').replace(']','').replace('(;)',';').split(';')
#Now _ variant by variant
# Variant is composed of Part1, Part2 and Type (e.g. delinsXXX)
# The format is : Part1_Part2Type
# Whenever we have uncertainty about one of the Parts, the part should be
# bewteen parenthesis (Part1)_(Part2)Type
#print variants
for i in range(0,len(variants)):
FinalID = "S%d_%d"%(ID,i+1)
try:
#Part1,Part2 = variants[i].split('_')
r = re.compile(r'(?:[^_(]|\([^)]*\))+')
Parts = r.findall(variants[i])
#print Parts
if len(Parts) ==2 :
tmp = two_to_parse(Parts)
elif len(Parts) == 1 and '(' in Parts[0] and ')' in Parts[0]:
tmp = one_to_parse(Parts[0])
else:
wr.write("KK\t%s\t%s\t%s\n"%(FinalID,variants[i],''.join([region,variants[i]]))) # nothing I can do here
continue
#now we have the tmp string which is START_END
# we have to print out region,start_end,type
try:
type_var = variants[i].split('_')[-1].split(')')[1]
except:
type_var =''
#print variants[i]
# if '154032268G>A' in variants[i]:
# print variants[i]
# print FinalID
# print region
# print Parts
# raise
wr.write("OK\t%s\t%s\t%s\n"%(FinalID,''.join([region,variants[i]]),''.join([region,tmp,type_var] )))
except:
wr.write("KO\t%s\t%s\t%s\n"%(FinalID,''.join([region,variants[i]]),variants[i]))
import sys
def parse_line(line,prot_idx,extra_fields,offset=2):
ff=line.strip().split('\t')
if '_' in ff[0]:
ff[0] = ff[0].split('_')[1]
ID= ':'.join([ff[0],ff[1],ff[2]])
gene=ff[3]
tx = ff[4]
extra = ff[-1]
ff.pop()
ff.insert
protein_pos = ff[prot_idx]
#this can be :
# just number 33
# N-N
# ?-N
# N-?
# ?
# -
start = 'NA'
end = 'NA'
if '-' in protein_pos:
if protein_pos =='-':
pass
else:
pp = protein_pos.split('-')
if pp[0] =='?':
end = int(pp[1])
start = max(0,end -offset)
elif pp[1] =='?':
start = int(pp[0])
end = start + offset
else: #both numbers
start = int(pp[0])
end = int(pp[1])
else :
#just number
start = int(protein_pos)
end = int(protein_pos)
start = str(start)
end = str(end)
## extra fields:
extra_list= extra.split(';')
extra_output = ['NA']*len(extra_fields)
for i in extra_list :
key = i.split('=')[0]
if key=='SIFT' or key=='PolyPhen':
val = i.split('=')[1].split('(')[1].replace(')','')
else:
val = i.split('=')[1]
extra_output[extra_fields.index(key)] = val
#print extra_output
ff.extend(extra_output)
#Add items
ff[prot_idx] = end
ff.insert(prot_idx,start)
ff.insert(0,ID)
#add start and end
outf=ff
#outf= [ID,gene,tx,start,end]
return outf
prot_idx = 9
with open(sys.argv[1]) as rd:
extra_fields = []
switch = False
for line in rd:
if line.startswith('##') :
if '## Extra column keys:' in line:
switch = True
elif switch:
extra_fields.append(line.split(' ')[1])
elif line.startswith('#'):
line = line.replace('#','')
ff = line.strip().split('\t')
prot_idx = ff.index('Protein_position')
ff[prot_idx] = 'Protein_start\tProtein_end'
ff.pop()
ff.extend(extra_fields)
print('ID\t'+'\t'.join(ff))
else:
print('\t'.join(parse_line(line,prot_idx,extra_fields)))
NC_000023.11:g.154031470G
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
"""
Models for representing genomic elements.
"""
from collections import namedtuple
class Position(object):
"""A position in the genome."""
def __init__(self, chrom, chrom_start, chrom_stop, is_forward_strand):
self.chrom = chrom
self.chrom_start = chrom_start
self.chrom_stop = chrom_stop
self.is_forward_strand = is_forward_strand
def __repr__(self):
return "<Position %s[%d:%d]>" % (
self.chrom, self.chrom_start, self.chrom_stop)
class Gene(object):
def __init__(self, name):
self.name = name
class Transcript(object):
"""RefGene Transcripts for hg19
A gene may have multiple transcripts with different combinations of exons.
"""
def __init__(self, name, version, gene, tx_position, cds_position,
is_default=False, exons=None):
self.name = name
self.version = version
self.gene = Gene(gene)
self.tx_position = tx_position
self.cds_position = cds_position
self.is_default = is_default
self.exons = exons if exons else []
@property
def full_name(self):
if self.version is not None:
return '%s.%d' % (self.name, self.version)
else:
return self.name
@property
def is_coding(self):
# Coding transcripts have CDS with non-zero length.
return (self.cds_position.chrom_stop -
self.cds_position.chrom_start > 0)
@property
def strand(self):
return ('+' if self.tx_position.is_forward_strand else '-')
@property
def coding_exons(self):
return [exon.get_as_interval(coding_only=True)
for exon in self.exons]
BED6Interval_base = namedtuple(
"BED6Interval_base", (
"chrom",
"chrom_start",
"chrom_end",
"name",
"score",
"strand"))
class BED6Interval(BED6Interval_base):
def distance(self, offset):
"""Return the distance to the interval.
if offset is inside the exon, distance is zero.
otherwise, distance is the distance to the nearest edge.
distance is positive if the exon comes after the offset.
distance is negative if the exon comes before the offset.
"""
start = self.chrom_start + 1
end = self.chrom_end
if start <= offset <= end:
return 0
start_distance = start - offset
end_distance = offset - end
if abs(start_distance) < abs(end_distance):
return start_distance
else:
return -end_distance
class Exon(object):
def __init__(self, transcript, tx_position, exon_number):
self.transcript = transcript
self.tx_position = tx_position
self.exon_number = exon_number
@property
def get_exon_name(self):
return "%s.%d" % (self.transcript.name, self.exon_number)
def get_as_interval(self, coding_only=False):
"""Returns the coding region for this exon as a BED6Interval.
This function returns a BED6Interval objects containing position
information for this exon. This may be used as input for
pybedtools.create_interval_from_list() after casting chrom_start
and chrom_end as strings.
coding_only: only include exons in the coding region
"""
exon_start = self.tx_position.chrom_start
exon_stop = self.tx_position.chrom_stop
# Get only exon coding region if requested
if coding_only:
if (exon_stop <= self.transcript.cds_position.chrom_start or
exon_start >= self.transcript.cds_position.chrom_stop):
return None
exon_start = max(exon_start,
self.transcript.cds_position.chrom_start)
exon_stop = min(
max(exon_stop, self.transcript.cds_position.chrom_start),
self.transcript.cds_position.chrom_stop)
return BED6Interval(
self.tx_position.chrom,
exon_start,
exon_stop,
self.get_exon_name,
'.',
self.strand,
)
@property
def strand(self):
strand = '+' if self.tx_position.is_forward_strand else '-'
return strand
This diff is collapsed.
This diff is collapsed.
chr11 17496507 17496508 T
chr11 17498251 17498252 G
chr11 17418842 17418843 G
chr11 17464265 17464266 C
chr11 17464265 17464266 C
chr11 17464265 17464266 C
chr11 17452525 17452526 T
chr11 17452579 17452580 C
chr11 17450106 17450107 C
chr11 17449509 17449510 C
chr11 17449410 17449411 T
chr11 17449412 17449413 C
chr11 17449411 17449412 A
chr17 7126027 7126037 AGCAGAGGTG
chr17 7126182 7126185 CGG
chr11 108004587 108004593 TTTTTT
chr20 43251291 43251296 ATCTG
chr17 48246449 48246452 CAG
chr2 241808282 241808285 ATG
chr2 241808282 241808285 ATG
chr17 41242937 41242959 GCACACACACACACGCTTTTTA
chr1 76216233 76216235 AA
chr2 241808727 241808729 GG
chr5 112163692 112163694 AC
chr5 112175545 112175547 AG
chr5 112175545 112175547 AG
chr1 76199215 76199222 GTCTTGG
chr1 76199196 76199242 CATTTTGATACTGTAGGAGGTCTTGGACTTGGAACTTTTGATGCTT
chr1 76199184 76199214 AATGTGTTGAAACATTTTGATACTGTAGGA
chr1 76199220 76199250 GGACTTGGAACTTTTGATGCTTGTTTAATT
chr1 76199231 76199232 T
chr1 76199212 76199253 GAGGTCTTGGACTTGGAACTTTTGATGCTTGTTTAATTAGT
chr1 76199202 76199232 GATACTGTAGGAGGTCTTGGACTTGGAACT
chr1 76199232 76199262 TTTGATGCTTGTTTAATTAGTGAAGAATTG
chr1 76199267 76199274 TGGATGT
chr1 76199248 76199294 TTAGTGAAGAATTGGCTTATGGATGTACAGGGGTTCAGACTGCTAT
chr1 76199237 76199267 TGCTTGTTTAATTAGTGAAGAATTGGCTTA
chr1 76199273 76199303 TACAGGGGTTCAGACTGCTATTGAAGGAAA
chr1 76200516 76200520 GAAG
chr1 76200497 76200540 GAAATGATCAACAAAAGAAGAAGTATTTGGGGAGAATGACTGA
chr1 76200481 76200511 CCTATTATTATTGCTGGAAATGATCAACAA
chr1 76200514 76200544 AAGAAGTATTTGGGGAGAATGACTGAGGAG
chr1 76200516 76200520 GAAG
chr1 76200497 76200540 GAAATGATCAACAAAAGAAGAAGTATTTGGGGAGAATGACTGA