Commit 3aef3522 authored by María Morales Martínez's avatar María Morales Martínez
Browse files

remove .ipynb_checkpoints foder

parent 61a7f435
import os
import shutil
import pandas as pd
from paths import *
# This script is out from the general analysis pipeline, because it is used to generate the lamin bed files that the pipeline use to determinate the ranges in which the input chromatin bed will be divided in the nuclei.
# Besides the orginal bed with Lamin B1 position,
# these parammeters limits de genomic distance to lamin of each range
LAD_ranges = [0, 250000, 1000000, 2500000, 5000000]
LAD_names = ["0kb", "250kb", "1000kb", "2500kb", "5000kb"]
# Create dir to save bed files
dir_lamin = os.path.join(lamin_b1, "window")
IsideLAD = os.path.join(dir_lamin, "Lamin_B1.InsideLAD.bed")
# Copy original Lamin B1 bed in dir
if not os.path.exists(dir_lamin):
os.makedirs(dir_lamin)
shutil.copy(os.path.join(lamin_b1, "Lamin_B1.bed"), IsideLAD)
# Loop to generate new lamin beds
for dist in range(0, len(LAD_ranges)):
if dist == 0:
continue
# Open original lamin bed
fnam = pd.read_csv(IsideLAD, sep = "\t", header=None,
names = ['Chromosome', 'Start', 'End'])
# Extend the chromatin space occuped by lamin B1 by their ends according to the established ranges.
fnam['Start'] = fnam['Start'] - LAD_ranges[dist]
fnam['End'] = fnam['End'] + LAD_ranges[dist]
fnam.loc[fnam.Start < 0, 'Start'] = 0
nf = os.path.join(dir_lamin, "Lamin_B1." + LAD_names[dist]+ ".bed")
# Save modified lamin bed
fnam.to_csv(nf, header = False, sep = "\t", index = False)
\ No newline at end of file
# Histone marks distribution in chromatin beds
<div style="text-align: justify">
This pipeline is dedicated to estimate the tendency of histone marks (or other possible transcription factor) in different established regions along the nuclei. Chromatin extension is divided acording to radial "artificial" regions from the periphery (Lamin) to center of the nuclei and statistics from the distribution of each acetylation/methylation mark in those regions are calculated (mean and std). These regions are defined by their distances that moves away to Lamin B1 protein.
</div>
## Requirements:
Before running the pipeline is necessary:
### Data preparation
#### Histone Chip-seq
Zipped Chip-seq data must be storaged in */data/chip-seq/unzip*.
List of chip-seq bed files that are going been used to check the chromatin. Grace to this list, it does not mutter if you have several replicates from same sample o more chip-seq files than they required, because only will be selected those which are in the list.
#### Input Chromatin Bed
Chromatin input bed must be storage in */data/chromatin_bed/[category]/* (created by you).
Establishing different categories avoid mix different types of bed files.
### Installation
This tool is programmed under python==3.7.8.
To ensure the success of the installation of all packages and dependencies, it is necesary activate the virtual environment.
```
source venv/bin/activate
```
Complete environment packeges are available in **requirements.txt**. It is possible install directly the packages running the next command:
```
pip install -r requirements.txt
```
## Usage:
Execute the following command adding the required information for the arguments.
```
python run_smk.py [-h] -c CAT -b BED -s SMK -l LIST
```
- **Category (CAT)**: category or type of input bed. This argument identifies the type of mapped DNA elements to which the chosen input bed belongs.
- **Input bed (BED)**: This bed is divided into different ranges of genomic distance from the periphery to the center of the nuclei. The genomic distance that defines these ranges is: In contact with Lamin B1, 0kb-250kb, 250kb-1000kb, 1000kb-2500kb, 2500kb-5000kb, Center. The acetylation/methylation content of this bed in each range will be evaluated according to the marks that overlap with the input bed.
- **Snakemake workflow (SMK)**: snakefile with the processing rules applied to the data. In sanakemake folder.
- **List of selected bed files (LIST)**: selected hisntone mark samples to be overlapped with the input bed. In marklists folder.
*Quick summary:*
```
python run_smk.py -h
```
For more detailled information, consult attached documentation.
## Example
The tool provides an example to guarantee the perfect execution that can be carried out by the user.
In the next command example, it is indicated the type of chromatin bed that is goingo to be divided in regions are neutral regions, the bed inside this category, the order of pipeline running and the list with the elements that will be used in the analysis
```
python run_smk.py -c nre -b hg38_NRE_filt5 -s processing.smk -l selected_files_ids.txt
```
There is another input data file for highly expressed genes in category transcription that can be run by the same user.
## Next updates:
- Upload complete documentation of all scripts (scheme required)
- Change structure to separate types of histone chip-seq used according the source of each dataset.
- Change command line parameters to config file witho all the parameters.
import os
# PATHS: dirs
# ------------------------------------------------------------------------------
wd = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
# Chromatin states
chromatin = os.path.join(wd, 'data', 'chromatin_beds')
# Lamins
lamin = os.path.join(chromatin, 'lamin')
lamin_b1 = os.path.join(lamin, 'lamin_B1')
# Marks
zip_marks = os.path.join(wd, 'data', 'chip-seq', 'zip_marks')
unzip_marks = os.path.join(wd, 'data', 'chip-seq', 'unzip_marks')
all_marks = os.path.join(unzip_marks, 'all')
sel_marks = os.path.join(unzip_marks, 'selected')
# Snakemake
sanakemakepath = os.path.join(wd,'pipelines','snakemake')
# List of Samples to be used
marklistspath = os.path.join(wd, 'pipelines', 'marklists')
\ No newline at end of file
import os
import sys
import pandas as pd
import numpy as np
import pyranges as pr
from natsort import natsorted
from paths import *
# FUNCTIONS
#-------------------------------------------------------------------------
def SubsetByState(df, destination):
states = np.unique(df['State'])
dir_state = os.path.join(os.path.dirname(destination), 'states')
if not os.path.isdir(dir_state):
os.makedirs(dir_state)
for state in states:
df = df[df['State'].isin(state)]
# WORKFLOWS
#-------------------------------------------------------------------------
category = sys.argv[1]
bed = sys.argv[2]
print("Selected category:", category, " and selected bed:", bed)
# Create list to filter
chr_list = ['chr' + str(i) for i in range(1,23)]
chr_list.append('chrX')
chr_list.append('chrY')
## Chromatin beds
chromatin_path = os.path.join(chromatin, category, bed, bed + ".bed")
chromatin_df = pd.read_csv(chromatin_path, sep = "\t", header=None)
if category == "nre":
chromatin_df.columns = ['Chromosome', 'Start', 'End']
if category == "transcription":
chromatin_df.columns = ['Chromosome', 'Start', 'End', 'Genes', 'Exp']
if category == "ChromHMM":
chromatin_df.columns = ['Chromosome', 'Start', 'End'] # revisar
# Remove alternative elements
chromatin_df = chromatin_df[chromatin_df['Chromosome'].isin(chr_list)]
chromatin_df.to_csv(os.path.join(chromatin, category, bed, bed + "_clean.bed"),
header = False, index = False, sep = "\t")
## Mark beds
for mark in os.listdir(sel_marks):
if len(os.listdir(os.path.join(sel_marks,mark))) == 2:
continue
mark_bed = os.listdir(os.path.join(sel_marks,mark))
markbed_path = os.path.join(sel_marks, mark, " ".join(str(i) for i in mark_bed))
mark_df = pd.read_csv(markbed_path, sep = "\t", header=None, usecols = [0,1,2])
mark_df.columns = ['Chromosome', 'Start', 'End']
# Filter Dataframe
mark_df = mark_df[mark_df['Chromosome'].isin(chr_list)]
# Sort Dataframe
mark_gr = pr.PyRanges(mark_df)
mark_gr = mark_gr.sort()
mark_gr.to_csv(os.path.join(sel_marks, mark, mark + ".bed"),
header = False, sep = "\t")
snakemake==5.27.4
argparse==1.1
natsort==7.0.1
pandas==1.1.3
numpy==1.19.1
pyranges==0.0.89
matplotlib==3.3.2
seaborn==0.11.0
import os
import sys
import argparse
from paths import *
from snakemake import snakemake
# Construct the argument parser
ap = argparse.ArgumentParser()
# Add the arguments to the parser
ap.add_argument("-c", "--cat", required=True,
help="type of bed to compare")
ap.add_argument("-b", "--bed", required=True,
help="bed file used for the operation")
ap.add_argument("-s", "--smk", required=True,
help="snakemake script")
ap.add_argument("-l", "--list", required=True,
help="list of marks selected to the operation")
args = ap.parse_args()
if args.cat:
category = args.cat
bed = args.bed
smk = os.path.join(sanakemakepath, args.smk)
mark_list = os.path.join(marklistspath, args.list)
# Run snakemake rules and
snk_status = snakemake(smk,
config = {"category": category, "bed": bed, "list": mark_list},
lock = False, targets = ['statistics'],
cores = 10,
force_incomplete = True)
\ No newline at end of file
import os
import sys
import shutil
from paths import *
# List of selected samples to be used in the run
mark_list = sys.argv[1]
# Remove samples from previous process
shutil.rmtree(sel_marks)
# Copy all files in selected_files_ids.txt in selected_beds
with open(mark_list,'r') as list:
for rows in list:
columns = rows.split(";")
ID = columns[0]
mark = columns[1][:-1]
# All unzip beds
source = os.path.join(all_marks, ID + ".bed")
# Clasification by mark
destination = os.path.join(sel_marks, mark.split("-")[0], ID + ".bed")
# Create folders for each mark
if not os.path.isdir(os.path.dirname(destination)):
os.makedirs(os.path.dirname(destination))
#Copy selected beds from sample list
print("copying selected file: ", ID + ".bed for mark: ", mark, " to ordered folders")
shutil.copyfile(source, destination)
import os
import sys
import time
import datetime
import pandas as pd
import numpy as np
import pyranges as pr
from paths import *
category = sys.argv[1]
bed = sys.argv[2]
bed_path = os.path.join(chromatin, category, bed)
# lamin places
intervals = {'InsideLAD':'InsideLAD','250kb':'0kb-250kb', '1000kb':'250kb-1000kb',
'2500kb':'1000kb-2500kb', '5000kb':'2500kb-5000kb', 'Center':'Center'}
LAD_names = list(intervals.keys())
# Normalization
def div_to_val(df, **kwargs):
col = kwargs['divisor']
df.loc[:, 'Val'] = df['Val'].div(df[col], axis = 0)
return df
# Get marks path
marcas = {}
for rep in os.listdir(sel_marks):
marcas[rep] = []
for fnam in os.listdir(os.path.join(sel_marks, rep)):
if not fnam.startswith('H'):
continue
marcas[rep].append(os.path.join(sel_marks, rep, fnam))
# Open chromatin bed:
chromatin_path = os.path.join(chromatin, category, bed)
for f in os.listdir(chromatin_path):
if f == 'state':
chromatin_path = os.path.join(chromatin_path, 'state')
# Rellenar si se incluyen más estados
if f.endswith('_clean.bed'):
file = os.path.join(chromatin_path, f)
chromatin_gr = pr.read_bed(file)
# Get ovelapped intervals and normalization by range distance:
val_marks = {}
for mark in marcas.keys():
for fnam in marcas[mark]:
mark_df = pr.read_bed(fnam)
gr = chromatin_gr.count_overlaps(mark_df, overlap_col="Val")
if category == "transcription":
val_marks[mark] = gr.apply(div_to_val, divisor='Score')
cols = ['Chromosome', 'Start','End','Genes','Exp','Val','Lamin']
else:
gr.Lengths = gr.lengths()
val_marks[mark] = gr.apply(div_to_val, divisor='Lengths')
cols = ['Chromosome', 'Start', 'End', 'Val', 'Lengths', 'Lamin']
# This block is dedicated to the construction of a new chromatin bed with all their intervals classified according to the place occupated in chromatin respectively to Lamin B1 and the numbers of histone marks loceted in them for each mark.
range_LADs = {}
for mark in marcas.keys():
range_LADs[mark] = { i : [] for i in LAD_names }
for ind, win in enumerate(LAD_names):
results = os.path.join(bed_path, "results")
val_path = os.path.join(results, 'values')
stat_path = os.path.join(results, 'statistics')
# Create Results folders
if not os.path.exists(results):
for dir in [results, val_path, stat_path]:
os.makedirs(dir)
# Center is the last step of the analysis
if win is not 'Center':
fnam_LAD = os.path.join(lamin_b1,'window', "Lamin_B1." + win + ".bed")
LAD_df = pr.read_bed(fnam_LAD)
range_LADs[mark][win] = {'LAD':[], 'noLAD':[]}
# Marks which ovelapps current chromatin region in each space determinated by the distance to Lamin
## Initial stating range
if win is 'InsideLAD':
gr = val_marks[mark].count_overlaps(LAD_df,
overlap_col="Lamin_count")
df = gr.df
df['Lamin'] = "noLAD"
## Nexts ranges towards the center
else:
gr = pr.PyRanges(range_LADs[mark][LAD_names[ind-1]]['noLAD'])
gr = gr.count_overlaps(LAD_df, overlap_col="Lamin_count")
df = gr.df
# Filtering those chromatin region whitn al least one mark in the region that we are checking
df.loc[df['Lamin_count'] > 0, 'Lamin'] = intervals[win]
df = df.drop('Lamin_count', axis=1)
# Reset index of dataframe
noLAD = df.loc[df['Lamin'] =="noLAD"].reset_index(drop=True)
LAD = df.loc[df['Lamin'] ==intervals[win]].reset_index(drop=True)
# Save the subdataframe for each lamin place and the rest of intervals to be re-checked for other lamin places
range_LADs[mark][win]['noLAD'] = noLAD
range_LADs[mark][win]['LAD'] = LAD
else:
df = range_LADs[mark][LAD_names[ind-1]]['noLAD']
df['Lamin'] = win
range_LADs[mark][win] = df
# Concat all subdataframes in a unique datrame with al the classfied chromatin regions
for mark in marcas.keys():
if category == 'transcription':
dfVal = pd.DataFrame(columns=['Chromosome', 'Start', 'End',
'Genes', 'Exp', 'Val', 'Lamin'])
if category == 'nre':
dfVal= pd.DataFrame(columns=['Chromosome', 'Start', 'End',
'Val', 'Lengths', 'Lamin'])
dfStat = pd.DataFrame(columns=['Mark', 'Lamin', 'Mean', 'Std'])
for win in LAD_names:
if win is not 'Center':
df = range_LADs[mark][win]['LAD']
df.columns = cols
dfVal = pd.concat([dfVal,df])
else:
df = range_LADs[mark][win]
df.columns = cols
dfVal = pd.concat([dfVal,df])
# Statistics: mean and std by lamin space
#pd.set_option('display.float_format', lambda x: '%.5f' % x)
with open(os.path.join(stat_path, mark + "_stats.bed"),'w') as f:
for win in LAD_names:
val_by_range = dfVal.loc[dfVal['Lamin'] == intervals[win], 'Val']
mean_by_range = val_by_range.mean()
std_by_range = val_by_range.std()
print(mark, win, "mean: " + str(mean_by_range), "std: " + str(std_by_range))
f.write('{}\t{}\t{}\t{}\n'.format(mark, intervals[win], mean_by_range, std_by_range))
dfVal= dfVal.sort_values(['Chromosome', 'Start', 'End'],
ascending=True).reset_index(drop = True)
dfVal.to_csv(os.path.join(val_path, mark + "_val.bed"),
header=False, index= False, sep='\t')
import os
import sys
import seaborn as sns
import numpy as np
import pandas as pd
from paths import *
from matplotlib import pyplot as plt
category = sys.argv[1]
bed = sys.argv[2]
# Lamin places
windows = ['InsideLAD', '0kb-250kb', '250kb-1000kb',
'1000kb-2500kb', '2500kb-5000kb', 'Center']
marks = os.listdir(sel_marks)
sample = os.path.join(chromatin, category, bed)
results_path = os.path.join(sample, 'results', 'values')
norm_data = {}
for w in windows:
norm_data[w] = {}
for mark in os.listdir(results_path):
mark_path = os.path.join(results_path, mark)
dfVal = pd.read_csv(mark_path, sep = "\t", header=None)
if category == "nre":
dfVal.columns = ['Chromosomes','Start','End',
'Val','Lengths','Lamin']
if category == "transcription":
dfVal.columns = ['Chromosomes','Start','End', 'Genes',
'Exp', 'Val', 'Lamin']
# Normalization of histone marks number.
# Dvide the number of marks in each bed interval by the sum of all marks in all lamin places
m = mark.split("_")[0]
total = dfVal['Val'].sum()
norm_data[w][m] = dfVal.loc[dfVal['Lamin'] == w, 'Val'].div(total) * 1000
# Plot tendency
plt.figure(figsize=(12, 12))
for num, what in enumerate(['Acethylation', 'Methylation'], 1):
axe = plt.subplot(2,1,num)
ys = []
these_marks = sorted([m for m in marks if what[:2].lower() in m],
key=lambda x: np.mean(norm_data[windows[-1]][x]))
for n, m in enumerate(these_marks):
plt.plot(range(len(windows)),
[np.mean(norm_data[w][m]) for w in windows], 'o-',
alpha=min(1, 0.1 + 1 * abs(np.mean(norm_data[windows[-1]][m]) - np.mean(norm_data[windows[0]][m]))),
color='k', lw=3, zorder=1000)
y = np.mean(norm_data[windows[-1]][m])
if category == 'transcription':
y2 = 0.05 + 0.8 * n / len(these_marks)
plt.ylim(0, 0.8)
if category == 'nre':
y2 = 0.05 + 7 * n / len(these_marks)
#plt.ylim(0, 6)
plt.text(len(windows) - 0.4, y2, m, va='center')
plt.plot([len(windows) - 0.9, len(windows) - 0.4], [y, y2], 'k:', lw=1)
_ = plt.xticks(range(len(windows)), windows)
ylim = plt.ylim()
plt.plot([0, len(windows) - 1], [0] * 2, 'k-')
plt.xlim(plt.xlim()[0], len(windows) + 2)
plt.title(what)
plt.ylabel('Normalized histone marks per CAGE-seq read')
axe.spines['right'].set_visible(False)
axe.spines['top'].set_visible(False)
axe.spines['bottom'].set_visible(False)
plot_path = os.path.join(sample, 'results', 'plot')
if not os.path.exists(plot_path):
os.makedirs(plot_path)
print("Generating plot...")
plt.savefig(os.path.join(plot_path, 'normalized_marks.png'))
print("Process finished")
import os
import gzip
import shutil
from paths import *
# Unzip bed files
def unCompress(path,id):
if not os.path.exists(path):
print("uncompressing file:" + id)
peaks_path = os.path.join(zip_marks, id + ".bed.gz")
with gzip.open(peaks_path, 'rb') as gz_ref:
with open(path,'wb') as gz_out:
print("unzip file: " + id + ".bed")
shutil.copyfileobj(gz_ref, gz_out)
else:
print("File "+ id + " already exists")
# List of mark files
with open(os.path.join(zip_marks,"metadata.tsv",),'r') as file:
next(file)
for lines in file: