Commit cd50aa8c authored by María Morales Martínez's avatar María Morales Martínez
Browse files

remove previous version

parent abfc068f
# Change Hi-C ranges
text = "hic_order = " # if any line contains this text, I want to modify the whole line.
new_text = "hic_order = " + "dddddddd"
pathRanges = "/home/bscuser/3DMetabolism/pipelines/ranges.py"
with open(pathRanges,"r") as f:
for line in f.readlines():
if text in line:
line = new_text
print(line)
f.close()
import os
import sys
import time
import datetime
import pandas as pd
import numpy as np
import pyranges as pr
from paths import *
chromatin_category = sys.argv[1]
chromatin_bed = sys.argv[2]
marks_dataset = sys.argv[3]
# lamin places
intervals = {'InsideLAD':'InsideLAD','250kb':'0kb-250kb', '1000kb':'250kb-1000kb',
'2500kb':'1000kb-2500kb', '5000kb':'2500kb-5000kb', 'Center':'Center'}
LAD_names = list(intervals.keys())
# Normalization
def div_to_val(df, **kwargs):
col = kwargs['divisor']
df.loc[:, 'Val'] = df['Val'].div(df[col], axis = 0)
return df
# Get marks path
marcas = {}
for rep in os.listdir(os.path.join(selpath, marks_dataset)):
marcas[rep] = []
for fnam in os.listdir(os.path.join(selpath, marks_dataset, rep)):
if not fnam.startswith('H'):
continue
marcas[rep].append(os.path.join(selpath, marks_dataset, rep, fnam))
# Open chromatin bed:
chromatin_dir = os.path.join(chromatinpath, chromatin_category, chromatin_bed)
for fnam in os.listdir(chromatin_dir):
if fnam.endswith('_clean.bed'):
file = os.path.join(chromatin_dir, fnam)
chromatin_gr = pr.read_bed(file)
# Get ovelapped intervals and normalization by ranges or distance
val_marks = {}
for mark in marcas.keys():
for fnam in marcas[mark]:
mark_df = pr.read_bed(fnam)
gr = chromatin_gr.count_overlaps(mark_df, overlap_col="Val")
if chromatin_category == "transcription":
val_marks[mark] = gr.apply(div_to_val, divisor='Score')
cols = ['Chromosome', 'Start','End','Genes','Exp','Val','Lamin']
else:
gr.Lengths = gr.lengths()
val_marks[mark] = gr.apply(div_to_val, divisor='Lengths')
cols = ['Chromosome', 'Start', 'End', 'Val', 'Lengths', 'Lamin']
# This block is dedicated to the construction of a new chromatin bed with all their intervals
# classified according to the place occupated in chromatin respectively to Lamin B1 and the numbers
# of histone marks loceted in them for each mark.
range_LADs = {}
for mark in marcas.keys():
range_LADs[mark] = { i : [] for i in LAD_names }
for ind, win in enumerate(LAD_names):
results = os.path.join(chromatin_dir, marks_dataset + "_results")
val_path = os.path.join(results, 'values')
stat_path = os.path.join(results, 'statistics')
# Create Results folders
if not os.path.exists(results):
for dir in [results, val_path, stat_path]:
os.makedirs(dir)
# Center is the last step of the analysis
if win is not 'Center':
fnam_LAD = os.path.join(lamin_b1,'window', "Lamin_B1." + win + ".bed")
LAD_df = pr.read_bed(fnam_LAD)
range_LADs[mark][win] = {'LAD':[], 'noLAD':[]}
# Marks which ovelapps current chromatin region in each space determinated by the distance to Lamin
## Initial stating range
if win is 'InsideLAD':
print("mark: " + mark + " LAD range: " + win)
gr = val_marks[mark].count_overlaps(LAD_df,
overlap_col="Lamin_count")
df = gr.df
df['Lamin'] = "noLAD"
## Nexts ranges towards the center
else:
print("mark: " + mark + " LAD range: " + win)
gr = pr.PyRanges(range_LADs[mark][LAD_names[ind-1]]['noLAD'])
gr = gr.count_overlaps(LAD_df, overlap_col="Lamin_count")
df = gr.df
# Filtering those chromatin region whitn al least one mark in the region that we are checking
df.loc[df['Lamin_count'] > 0, 'Lamin'] = intervals[win]
df = df.drop('Lamin_count', axis=1)
# Reset index of dataframe
noLAD = df.loc[df['Lamin'] =="noLAD"].reset_index(drop=True)
LAD = df.loc[df['Lamin'] ==intervals[win]].reset_index(drop=True)
# Save the subdataframe for each lamin place and the rest of intervals to be re-checked
# for other lamin places
range_LADs[mark][win]['noLAD'] = noLAD
range_LADs[mark][win]['LAD'] = LAD
else:
print("mark: " + mark + " LAD range: " + win)
df = range_LADs[mark][LAD_names[ind-1]]['noLAD']
df['Lamin'] = win
range_LADs[mark][win] = df
# Concat all subdataframes in a unique datrame with al the classfied chromatin regions
for mark in marcas.keys():
if chromatin_category == 'transcription':
dfVal = pd.DataFrame(columns=['Chromosome', 'Start', 'End',
'Genes', 'Exp', 'Val', 'Lamin'])
else:
dfVal= pd.DataFrame(columns=['Chromosome', 'Start', 'End',
'Val', 'Lengths', 'Lamin'])
dfStat = pd.DataFrame(columns=['Mark', 'Lamin', 'Mean', 'Std'])
for win in LAD_names:
if win is not 'Center':
df = range_LADs[mark][win]['LAD']
df.columns = cols
dfVal = pd.concat([dfVal,df])
else:
df = range_LADs[mark][win]
df.columns = cols
dfVal = pd.concat([dfVal,df])
# Statistics: mean and std by lamin space
# pd.set_option('display.float_format', lambda x: '%.5f' % x)
with open(os.path.join(stat_path, mark + "_stats.bed"),'w') as f:
for win in LAD_names:
val_by_range = dfVal.loc[dfVal['Lamin'] == intervals[win], 'Val']
mean_by_range = val_by_range.mean()
std_by_range = val_by_range.std()
print(mark, win, "mean: " + str(mean_by_range), "std: " + str(std_by_range))
f.write('{}\t{}\t{}\t{}\n'.format(mark, intervals[win], mean_by_range, std_by_range))
# Sort values and reset the dataframe index
dfVal= dfVal.sort_values(['Chromosome', 'Start', 'End'],
ascending=True).reset_index(drop = True)
# Save statistics
dfVal.to_csv(os.path.join(val_path, mark + "_val.bed"),
header=False, index= False, sep='\t')
import os
import sys
import seaborn as sns
import numpy as np
import pandas as pd
from paths import *
from matplotlib import pyplot as plt
chromatin_category = sys.argv[1]
chromatin_bed = sys.argv[2]
marks_dataset = sys.argv[3]
# Lamin places
windows = ['InsideLAD', '0kb-250kb', '250kb-1000kb',
'1000kb-2500kb', '2500kb-5000kb', 'Center']
marks = os.listdir(os.path.join(selpath, marks_dataset))
chromatin_dir = os.path.join(chromatinpath, chromatin_category, chromatin_bed)
results_path = os.path.join(chromatin_dir, marks_dataset + '_results', 'values')
num_ac = len([m for m in marks if "ac" in m])
num_met = len([m for m in marks if "me" in m])
norm_data = {}
for w in windows:
norm_data[w] = {}
for mark in os.listdir(results_path):
mark_path = os.path.join(results_path, mark)
dfVal = pd.read_csv(mark_path, sep = "\t", header=None)
if chromatin_category == "transcription":
dfVal.columns = ['Chromosomes','Start','End', 'Genes',
'Exp', 'Val', 'Lamin']
else:
dfVal.columns = ['Chromosomes','Start','End',
'Val','Lengths','Lamin']
# Normalization of histone marks number.
# Dvide the number of marks in each bed interval by the sum of all marks in all lamin places
m = mark.rsplit('_', 1)[0]
total = dfVal['Val'].sum()
norm_data[w][m] = dfVal.loc[dfVal['Lamin'] == w, 'Val'].div(total) * 1000
nn = 0
# Plot tendency
plt.figure(figsize=(12, 12))
for num, what in enumerate(['Acethylation', 'Methylation'], 1):
axe = plt.subplot(2,1,num)
ys = []
these_marks = sorted([m for m in marks if what[:2].lower() in m],
key=lambda x: np.mean(norm_data[windows[-1]][x]))
for n, m in enumerate(these_marks):
tendency = np.mean(norm_data[windows[-1]][m]) - np.mean(norm_data[windows[0]][m])
if tendency <= 0:
tendency = 0
plt.plot(range(len(windows)),
[np.mean(norm_data[w][m]) for w in windows], 'o-',
alpha=min(1, 0.1 + 1 * tendency),
color='k', lw=3, zorder=1000)
y = np.mean(norm_data[windows[-1]][m])
if chromatin_category == 'transcription':
y2 = 0.05 + 0.8 * n / len(these_marks)
plt.ylim(0, 0.8)
else:
y2 = 0.05 + 7 * n / len(these_marks)
#plt.ylim(0, 6)
plt.text(len(windows) - 0.4, y2, m, va='center')
plt.plot([len(windows) - 0.9, len(windows) - 0.4], [y, y2], 'k:', lw=1)
_ = plt.xticks(range(len(windows)), windows)
ylim = plt.ylim()
plt.plot([0, len(windows) - 1], [0] * 2, 'k-')
plt.xlim(plt.xlim()[0], len(windows) + 2)
plt.title(what)
plt.ylabel('Normalized histone marks per CAGE-seq read')
axe.spines['right'].set_visible(False)
axe.spines['top'].set_visible(False)
axe.spines['bottom'].set_visible(False)
plot_path = os.path.join(chromatin_dir, marks_dataset + '_results', 'plot')
if not os.path.exists(plot_path):
os.makedirs(plot_path)
print("Generating plot...")
plt.savefig(os.path.join(plot_path, 'normalized_marks.png'))
print("Process finished")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment