Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
Computational Biology
HisTeriA
Commits
cd50aa8c
Commit
cd50aa8c
authored
Jun 05, 2021
by
María Morales Martínez
Browse files
remove previous version
parent
abfc068f
Changes
3
Hide whitespace changes
Inline
Side-by-side
pipelines/prueba.py
deleted
100644 → 0
View file @
abfc068f
# Change Hi-C ranges
text
=
"hic_order = "
# if any line contains this text, I want to modify the whole line.
new_text
=
"hic_order = "
+
"dddddddd"
pathRanges
=
"/home/bscuser/3DMetabolism/pipelines/ranges.py"
with
open
(
pathRanges
,
"r"
)
as
f
:
for
line
in
f
.
readlines
():
if
text
in
line
:
line
=
new_text
print
(
line
)
f
.
close
()
pipelines/statistics.py
deleted
100644 → 0
View file @
abfc068f
import
os
import
sys
import
time
import
datetime
import
pandas
as
pd
import
numpy
as
np
import
pyranges
as
pr
from
paths
import
*
chromatin_category
=
sys
.
argv
[
1
]
chromatin_bed
=
sys
.
argv
[
2
]
marks_dataset
=
sys
.
argv
[
3
]
# lamin places
intervals
=
{
'InsideLAD'
:
'InsideLAD'
,
'250kb'
:
'0kb-250kb'
,
'1000kb'
:
'250kb-1000kb'
,
'2500kb'
:
'1000kb-2500kb'
,
'5000kb'
:
'2500kb-5000kb'
,
'Center'
:
'Center'
}
LAD_names
=
list
(
intervals
.
keys
())
# Normalization
def
div_to_val
(
df
,
**
kwargs
):
col
=
kwargs
[
'divisor'
]
df
.
loc
[:,
'Val'
]
=
df
[
'Val'
].
div
(
df
[
col
],
axis
=
0
)
return
df
# Get marks path
marcas
=
{}
for
rep
in
os
.
listdir
(
os
.
path
.
join
(
selpath
,
marks_dataset
)):
marcas
[
rep
]
=
[]
for
fnam
in
os
.
listdir
(
os
.
path
.
join
(
selpath
,
marks_dataset
,
rep
)):
if
not
fnam
.
startswith
(
'H'
):
continue
marcas
[
rep
].
append
(
os
.
path
.
join
(
selpath
,
marks_dataset
,
rep
,
fnam
))
# Open chromatin bed:
chromatin_dir
=
os
.
path
.
join
(
chromatinpath
,
chromatin_category
,
chromatin_bed
)
for
fnam
in
os
.
listdir
(
chromatin_dir
):
if
fnam
.
endswith
(
'_clean.bed'
):
file
=
os
.
path
.
join
(
chromatin_dir
,
fnam
)
chromatin_gr
=
pr
.
read_bed
(
file
)
# Get ovelapped intervals and normalization by ranges or distance
val_marks
=
{}
for
mark
in
marcas
.
keys
():
for
fnam
in
marcas
[
mark
]:
mark_df
=
pr
.
read_bed
(
fnam
)
gr
=
chromatin_gr
.
count_overlaps
(
mark_df
,
overlap_col
=
"Val"
)
if
chromatin_category
==
"transcription"
:
val_marks
[
mark
]
=
gr
.
apply
(
div_to_val
,
divisor
=
'Score'
)
cols
=
[
'Chromosome'
,
'Start'
,
'End'
,
'Genes'
,
'Exp'
,
'Val'
,
'Lamin'
]
else
:
gr
.
Lengths
=
gr
.
lengths
()
val_marks
[
mark
]
=
gr
.
apply
(
div_to_val
,
divisor
=
'Lengths'
)
cols
=
[
'Chromosome'
,
'Start'
,
'End'
,
'Val'
,
'Lengths'
,
'Lamin'
]
# This block is dedicated to the construction of a new chromatin bed with all their intervals
# classified according to the place occupated in chromatin respectively to Lamin B1 and the numbers
# of histone marks loceted in them for each mark.
range_LADs
=
{}
for
mark
in
marcas
.
keys
():
range_LADs
[
mark
]
=
{
i
:
[]
for
i
in
LAD_names
}
for
ind
,
win
in
enumerate
(
LAD_names
):
results
=
os
.
path
.
join
(
chromatin_dir
,
marks_dataset
+
"_results"
)
val_path
=
os
.
path
.
join
(
results
,
'values'
)
stat_path
=
os
.
path
.
join
(
results
,
'statistics'
)
# Create Results folders
if
not
os
.
path
.
exists
(
results
):
for
dir
in
[
results
,
val_path
,
stat_path
]:
os
.
makedirs
(
dir
)
# Center is the last step of the analysis
if
win
is
not
'Center'
:
fnam_LAD
=
os
.
path
.
join
(
lamin_b1
,
'window'
,
"Lamin_B1."
+
win
+
".bed"
)
LAD_df
=
pr
.
read_bed
(
fnam_LAD
)
range_LADs
[
mark
][
win
]
=
{
'LAD'
:[],
'noLAD'
:[]}
# Marks which ovelapps current chromatin region in each space determinated by the distance to Lamin
## Initial stating range
if
win
is
'InsideLAD'
:
print
(
"mark: "
+
mark
+
" LAD range: "
+
win
)
gr
=
val_marks
[
mark
].
count_overlaps
(
LAD_df
,
overlap_col
=
"Lamin_count"
)
df
=
gr
.
df
df
[
'Lamin'
]
=
"noLAD"
## Nexts ranges towards the center
else
:
print
(
"mark: "
+
mark
+
" LAD range: "
+
win
)
gr
=
pr
.
PyRanges
(
range_LADs
[
mark
][
LAD_names
[
ind
-
1
]][
'noLAD'
])
gr
=
gr
.
count_overlaps
(
LAD_df
,
overlap_col
=
"Lamin_count"
)
df
=
gr
.
df
# Filtering those chromatin region whitn al least one mark in the region that we are checking
df
.
loc
[
df
[
'Lamin_count'
]
>
0
,
'Lamin'
]
=
intervals
[
win
]
df
=
df
.
drop
(
'Lamin_count'
,
axis
=
1
)
# Reset index of dataframe
noLAD
=
df
.
loc
[
df
[
'Lamin'
]
==
"noLAD"
].
reset_index
(
drop
=
True
)
LAD
=
df
.
loc
[
df
[
'Lamin'
]
==
intervals
[
win
]].
reset_index
(
drop
=
True
)
# Save the subdataframe for each lamin place and the rest of intervals to be re-checked
# for other lamin places
range_LADs
[
mark
][
win
][
'noLAD'
]
=
noLAD
range_LADs
[
mark
][
win
][
'LAD'
]
=
LAD
else
:
print
(
"mark: "
+
mark
+
" LAD range: "
+
win
)
df
=
range_LADs
[
mark
][
LAD_names
[
ind
-
1
]][
'noLAD'
]
df
[
'Lamin'
]
=
win
range_LADs
[
mark
][
win
]
=
df
# Concat all subdataframes in a unique datrame with al the classfied chromatin regions
for
mark
in
marcas
.
keys
():
if
chromatin_category
==
'transcription'
:
dfVal
=
pd
.
DataFrame
(
columns
=
[
'Chromosome'
,
'Start'
,
'End'
,
'Genes'
,
'Exp'
,
'Val'
,
'Lamin'
])
else
:
dfVal
=
pd
.
DataFrame
(
columns
=
[
'Chromosome'
,
'Start'
,
'End'
,
'Val'
,
'Lengths'
,
'Lamin'
])
dfStat
=
pd
.
DataFrame
(
columns
=
[
'Mark'
,
'Lamin'
,
'Mean'
,
'Std'
])
for
win
in
LAD_names
:
if
win
is
not
'Center'
:
df
=
range_LADs
[
mark
][
win
][
'LAD'
]
df
.
columns
=
cols
dfVal
=
pd
.
concat
([
dfVal
,
df
])
else
:
df
=
range_LADs
[
mark
][
win
]
df
.
columns
=
cols
dfVal
=
pd
.
concat
([
dfVal
,
df
])
# Statistics: mean and std by lamin space
# pd.set_option('display.float_format', lambda x: '%.5f' % x)
with
open
(
os
.
path
.
join
(
stat_path
,
mark
+
"_stats.bed"
),
'w'
)
as
f
:
for
win
in
LAD_names
:
val_by_range
=
dfVal
.
loc
[
dfVal
[
'Lamin'
]
==
intervals
[
win
],
'Val'
]
mean_by_range
=
val_by_range
.
mean
()
std_by_range
=
val_by_range
.
std
()
print
(
mark
,
win
,
"mean: "
+
str
(
mean_by_range
),
"std: "
+
str
(
std_by_range
))
f
.
write
(
'{}
\t
{}
\t
{}
\t
{}
\n
'
.
format
(
mark
,
intervals
[
win
],
mean_by_range
,
std_by_range
))
# Sort values and reset the dataframe index
dfVal
=
dfVal
.
sort_values
([
'Chromosome'
,
'Start'
,
'End'
],
ascending
=
True
).
reset_index
(
drop
=
True
)
# Save statistics
dfVal
.
to_csv
(
os
.
path
.
join
(
val_path
,
mark
+
"_val.bed"
),
header
=
False
,
index
=
False
,
sep
=
'
\t
'
)
pipelines/tendency.py
deleted
100644 → 0
View file @
abfc068f
import
os
import
sys
import
seaborn
as
sns
import
numpy
as
np
import
pandas
as
pd
from
paths
import
*
from
matplotlib
import
pyplot
as
plt
chromatin_category
=
sys
.
argv
[
1
]
chromatin_bed
=
sys
.
argv
[
2
]
marks_dataset
=
sys
.
argv
[
3
]
# Lamin places
windows
=
[
'InsideLAD'
,
'0kb-250kb'
,
'250kb-1000kb'
,
'1000kb-2500kb'
,
'2500kb-5000kb'
,
'Center'
]
marks
=
os
.
listdir
(
os
.
path
.
join
(
selpath
,
marks_dataset
))
chromatin_dir
=
os
.
path
.
join
(
chromatinpath
,
chromatin_category
,
chromatin_bed
)
results_path
=
os
.
path
.
join
(
chromatin_dir
,
marks_dataset
+
'_results'
,
'values'
)
num_ac
=
len
([
m
for
m
in
marks
if
"ac"
in
m
])
num_met
=
len
([
m
for
m
in
marks
if
"me"
in
m
])
norm_data
=
{}
for
w
in
windows
:
norm_data
[
w
]
=
{}
for
mark
in
os
.
listdir
(
results_path
):
mark_path
=
os
.
path
.
join
(
results_path
,
mark
)
dfVal
=
pd
.
read_csv
(
mark_path
,
sep
=
"
\t
"
,
header
=
None
)
if
chromatin_category
==
"transcription"
:
dfVal
.
columns
=
[
'Chromosomes'
,
'Start'
,
'End'
,
'Genes'
,
'Exp'
,
'Val'
,
'Lamin'
]
else
:
dfVal
.
columns
=
[
'Chromosomes'
,
'Start'
,
'End'
,
'Val'
,
'Lengths'
,
'Lamin'
]
# Normalization of histone marks number.
# Dvide the number of marks in each bed interval by the sum of all marks in all lamin places
m
=
mark
.
rsplit
(
'_'
,
1
)[
0
]
total
=
dfVal
[
'Val'
].
sum
()
norm_data
[
w
][
m
]
=
dfVal
.
loc
[
dfVal
[
'Lamin'
]
==
w
,
'Val'
].
div
(
total
)
*
1000
nn
=
0
# Plot tendency
plt
.
figure
(
figsize
=
(
12
,
12
))
for
num
,
what
in
enumerate
([
'Acethylation'
,
'Methylation'
],
1
):
axe
=
plt
.
subplot
(
2
,
1
,
num
)
ys
=
[]
these_marks
=
sorted
([
m
for
m
in
marks
if
what
[:
2
].
lower
()
in
m
],
key
=
lambda
x
:
np
.
mean
(
norm_data
[
windows
[
-
1
]][
x
]))
for
n
,
m
in
enumerate
(
these_marks
):
tendency
=
np
.
mean
(
norm_data
[
windows
[
-
1
]][
m
])
-
np
.
mean
(
norm_data
[
windows
[
0
]][
m
])
if
tendency
<=
0
:
tendency
=
0
plt
.
plot
(
range
(
len
(
windows
)),
[
np
.
mean
(
norm_data
[
w
][
m
])
for
w
in
windows
],
'o-'
,
alpha
=
min
(
1
,
0.1
+
1
*
tendency
),
color
=
'k'
,
lw
=
3
,
zorder
=
1000
)
y
=
np
.
mean
(
norm_data
[
windows
[
-
1
]][
m
])
if
chromatin_category
==
'transcription'
:
y2
=
0.05
+
0.8
*
n
/
len
(
these_marks
)
plt
.
ylim
(
0
,
0.8
)
else
:
y2
=
0.05
+
7
*
n
/
len
(
these_marks
)
#plt.ylim(0, 6)
plt
.
text
(
len
(
windows
)
-
0.4
,
y2
,
m
,
va
=
'center'
)
plt
.
plot
([
len
(
windows
)
-
0.9
,
len
(
windows
)
-
0.4
],
[
y
,
y2
],
'k:'
,
lw
=
1
)
_
=
plt
.
xticks
(
range
(
len
(
windows
)),
windows
)
ylim
=
plt
.
ylim
()
plt
.
plot
([
0
,
len
(
windows
)
-
1
],
[
0
]
*
2
,
'k-'
)
plt
.
xlim
(
plt
.
xlim
()[
0
],
len
(
windows
)
+
2
)
plt
.
title
(
what
)
plt
.
ylabel
(
'Normalized histone marks per CAGE-seq read'
)
axe
.
spines
[
'right'
].
set_visible
(
False
)
axe
.
spines
[
'top'
].
set_visible
(
False
)
axe
.
spines
[
'bottom'
].
set_visible
(
False
)
plot_path
=
os
.
path
.
join
(
chromatin_dir
,
marks_dataset
+
'_results'
,
'plot'
)
if
not
os
.
path
.
exists
(
plot_path
):
os
.
makedirs
(
plot_path
)
print
(
"Generating plot..."
)
plt
.
savefig
(
os
.
path
.
join
(
plot_path
,
'normalized_marks.png'
))
print
(
"Process finished"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment