Commit 4f5eba5b authored by Esther Vendrell Miras's avatar Esther Vendrell Miras

JSON templates dir

parent d83e14aa
Pipeline #11004 passed with stages
in 38 seconds
.nextflow*
builds/
work/
sample_out/
{
"$id": "https://github.com/inab/benchmarking/minimal-json-schemas",
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Minimal dataset concept (ELIXIR-EXCELERATE WP2 Benchmarking schemas)",
"type": "object",
"properties": {
"_id": {
"title": "The unique community-set id of the dataset",
"type": "string"
},
"_schema": {
"title": "The JSON schema absolute URI. It must match the value of 'id' in the JSON schema",
"type": "string",
"format": "uri",
"enum": [ "https://github.com/inab/benchmarking/minimal-json-schemas" ]
},
"community_id": {
"title": "The community where this dataset was generated or used",
"description": "The name or OEB official id for the community which used or generated the dataset",
"type": "string",
"foreign_keys": [
{
"schema_id": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Community",
"members": [ "." ]
}
],
"minLength": 1
},
"challenge_id": {
"title": "The challenge(s) where this dataset was generated or used",
"oneOf": [
{
"type": "array",
"minItems": 1,
"items": {
"type": "string",
"foreign_keys": [
{
"schema_id": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Challenge",
"members": [
"."
]
}
],
"minLength": 1
}
},
{
"type": "string",
"foreign_keys": [
{
"schema_id": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Challenge",
"members": [ "." ]
}
],
"minLength": 1
}
]
},
"type": {
"title": "Type of dataset.",
"type": "string",
"mingLength": 1,
"enum": [ "participant", "assessment" ]
},
"datalink": {
"title": "The data itself",
"type": "object",
"properties": {
"validation_date": {
"title": "Last validation date of the datalink",
"type": "string",
"format": "date-time"
},
"status": {
"title": "Last validation status (unreachable content, missing content(404), interrupted while validating, corrupted (checksums, not all the contents available, etc...), ok)",
"type": "string",
"enum": [ "unreachable", "missing", "interrupted", "corrupted", "ok" ]
},
"attrs": {
"title": "Attributes describing the datalink: whether it is inline (i.e. a data URI); a CURIE; whether it is compressed, or an archive (tar, zip, rar), a metalink (RFC 5854, RFC 6249, https://en.wikipedia.org/wiki/Metalink) or a standard sums (MD5, SHA1) file",
"type": "array",
"minItems": 1,
"uniqueItems": true,
"anyOf": [
{
"items": {
"type": "string",
"enum": [ "inline","compressed", "archive", "metalink", "sums" ]
}
},
{
"items": {
"type": "string",
"enum": [ "curie", "compressed", "archive", "metalink" ]
}
}
]
}
},
"additionalProperties": false,
"required": [ "validation_date", "status" ]
},
"metrics": {
"title": "The computed metrics stored as inline data",
"type": "object",
"properties": {
"metric_id":{
"title": "the OEB official id / name of the metric",
"type":"string",
"foreign_keys": [
{
"schema_id": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Metrics",
"members": [ "." ]
}
],
"minLength": 1
},
"value": {
"title": "the numerical value of the metric",
"type":"number",
"minLength":1
},
"stderr": {
"title": "the standard error / deviation for the metric (if exists)",
"type":"number",
"minLength":1
}
},
"required": ["metric_id", "value"]
},
"participant_id": {
"title": "The id / name of the tool which generated this dataset",
"type": "string",
"foreign_keys": [
{
"schema_id": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Tool",
"members": [ "." ]
}
],
"minLength": 1
}
},
"additionalProperties": false,
"required": ["_id","community_id","challenge_id", "type","participant_id"],
"primary_key": ["_id"],
"dependencies": {}
}
from datetime import datetime
import os
import json
import jsonschema
import sys
"""
INFO:
This module contains functions that generate JSON objects with structure compatible with the Elixir
Benchmarking Data Model (https://github.com/inab/benchmarking-data-model)
It should be used in the docker declarations to generate the output files in any benchmarking workflow which might be implemented in the
OpenEBench infrastructure.
Benchmarking workflows architecture can be found in https://github.com/inab/TCGA_benchmarking_workflow
Docker declarations for each step: https://github.com/inab/TCGA_benchmarking_dockers
"""
##############################################################################################################################################
##############################################################################################################################################
"""
Participant datasets should be generated in the VALIDATION step
The minimal required properties for this dataset are:
- ID - the id assigned to this dataset by the community
- community - the benchmarking community name/OEB-id
- challenges - an array with one or more challenges where the participant is evaluated
- participant_name - name/OEB-id of the tool which generated the dataset
- validated(boolean) - whether this file passed the validation script or not
"""
def write_participant_dataset( ID, community, challenges, participant_name, validated):
    """Build and schema-validate a 'participant' dataset (VALIDATION step).

    Parameters:
        ID: the id assigned to this dataset by the community.
        community: the benchmarking community name / OEB id.
        challenges: one challenge id or an array of challenge ids where the
            participant is evaluated (the schema accepts either form).
        participant_name: name / OEB id of the tool which generated the dataset.
        validated: boolean — whether the file passed the validation script.

    Returns:
        The dataset dict when it conforms to the minimal JSON schema,
        otherwise None (an error message is written to stderr).
    """
    # datalink.status enum in the schema: "ok" on success, "corrupted" otherwise.
    status = "ok" if validated else "corrupted"
    data = {
        "_id": ID,
        "community_id": community,
        "challenge_id": challenges,
        "type": "participant",
        "datalink": {
            "attrs": ["archive"],
            # Naive local time truncated to whole seconds; isoformat() already
            # returns a str matching the schema's "date-time" format.
            "validation_date": datetime.now().replace(microsecond=0).isoformat(),
            "status": status
        },
        "participant_id": participant_name,
    }
    # Validate the generated object against the minimal JSON schema shipped
    # alongside this module.
    schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'Benchmarking_minimal_datasets_schema.json')
    with open(schema_path, 'r') as f:
        schema = json.load(f)
    try:
        jsonschema.validate(data, schema)
        return data
    except jsonschema.exceptions.ValidationError as ve:
        sys.stderr.write("ERROR: JSON schema validation failed. Output json file does not have the correct format:\n" + str(ve) + "\n")
        return None
"""
Assessment datasets should be generated in the METRICS COMPUTATION step
The minimal required properties for this dataset are:
- ID - the id assigned to this dataset by the community
- community - the benchmarking community name/OEB-id
- challenge - the challenge where the metrics were computed
- participant_name - name/OEB-id of the tool which is evaluated in this assessment
 - metric - the name of the unique metric that corresponds to this assessment
- metric_value - the numeric value of the metric
- error - the standard error/deviation for the computed metric (can be 0)
"""
def write_assessment_dataset( ID, community, challenge, participant_name, metric, metric_value, error):
    """Build and schema-validate an 'assessment' dataset (METRICS COMPUTATION step).

    Parameters:
        ID: the id assigned to this dataset by the community.
        community: the benchmarking community name / OEB id.
        challenge: the challenge where the metrics were computed.
        participant_name: name / OEB id of the tool evaluated in this assessment.
        metric: name / OEB id of the metric this assessment corresponds to.
        metric_value: the numeric value of the metric.
        error: the standard error / deviation for the metric (can be 0).

    Returns:
        The dataset dict when it conforms to the minimal JSON schema,
        otherwise None (an error message is written to stderr).
    """
    data = {
        "_id": ID,
        "community_id": community,
        "challenge_id": challenge,
        "type": "assessment",
        "metrics": {
            "metric_id": metric,
            "value": float(metric_value),
            # Coerce to float for consistency with "value": the schema requires
            # a JSON number here, so a numeric string would otherwise fail
            # validation silently.
            "stderr": float(error)
        },
        "participant_id": participant_name
    }
    # Validate the generated object against the minimal JSON schema shipped
    # alongside this module.
    schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'Benchmarking_minimal_datasets_schema.json')
    with open(schema_path, 'r') as f:
        schema = json.load(f)
    try:
        jsonschema.validate(data, schema)
        return data
    except jsonschema.exceptions.ValidationError as ve:
        sys.stderr.write(
            "ERROR: JSON schema validation failed. Output json file does not have the correct format:\n" + str(
                ve) + "\n")
        return None
\ No newline at end of file
......@@ -10,7 +10,7 @@
"archive"
],
"status": "ok",
"validation_date": "2020-08-26T15:06:09"
"validation_date": "2020-09-04T09:16:03"
},
"participant_id": "my_gene_predictor",
"type": "participant"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment