Commit 82cafa7d authored by jcorvi's avatar jcorvi

version 1.1 several improvement over the annotation process with cdisc

send and etox terminology
parent 476928d2
Pipeline #2796 canceled with stage
in 7 minutes and 44 seconds
......@@ -3,3 +3,13 @@
## Version 1.0, 2020-03-03
First version of the component.
## Version 1.1, 2020-03-11
Improvements to several dictionaries and rules:
- Adding the Study Domain as a Feature to the Study Test in the ETOX and CDISC SEND Terminology.
- Adding the Study Domain as a Feature to the Finding in ETOX and CDISC SEND Terminology.
- LBTEST and PKPARM correct mapping to the LB and PP domain.
- Stop words for finding and for anatomy.
- Mapping of ETOX ILO to specific domains.
\ No newline at end of file
......@@ -5,18 +5,26 @@ This component annotated text using CDISC SEND and eTOX (OntoBrowser) terminolog
## Description
This component annotates several entities related to treatment-related findings:
FINDINGS.
STUDY_TESTCDS.
SPECIMEN.
FINDINGS
STUDY_TESTCDS
SPECIMEN
ROUTE_OF_ADMINISTRATION
SEX
CDISC SEND controlled terminology: CDISC maintains and develops the official SEND terminology, available at: https://evs.nci.nih.gov/ftp1/CDISC/SEND/.
ETOX terminology: Information available from the Ontobrowser system was used to extend the terminology. The primary objective of this system was to provide an online collaborative solution for expert curators to map report terms (from the eTOX database) to preferred ontology (or controlled terminology) terms.
For each tagged entity, the controlled terminology code is added as a feature in the case of CDISC SEND controlled terms; likewise, when a term belongs to the eTOX (Ontobrowser) terminology, a reference identifier is added as a feature.
For the fields FINDING and STUDY_TESTCD a STUDY_DOMAIN feature is added to describe the context of the study.
Internally, the cdisc-etox-annotation library uses the generic nlp-gate-generic-component https://gitlab.bsc.es/inb/text-mining/generic-tools/nlp-gate-generic-component. This library is a generic component that annotates text with parameterized GATE-formatted gazetteers/dictionaries. In other words, the cdisc-etox-annotation library is an instance of the nlp-gate-generic-component with a specific set of dictionaries.
## Actual Version: 1.0, 2020-03-04
## Current Version: 1.1, 2020-03-11
## [Changelog](https://gitlab.bsc.es/inb/text-mining/bio-tools/cdisc-etox-annotation/blob/master/CHANGELOG)
## Docker
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -2,4 +2,6 @@ etox_in-life-observations_dict.lst:ETOX_ILO:ETOX_ILO
etox_anatomy_dict.lst:ANATOMY_ETOX:ANATOMY_ETOX
etox_send_dict.lst:SEND_ETOX:SEND_ETOX
etox_moa_dict.lst:MOA_ETOX:MOA_ETOX
cdisc_send_dict.lst:SEND_CIDSC:SEND_CIDSC
\ No newline at end of file
cdisc_send_dict.lst:SEND_CIDSC:SEND_CIDSC
stop_words_finding.lst:STOP_WORD_FINDING:STOP_WORD_FINDING
stop_words_anatomy.lst:STOP_WORD_ANATOMY:STOP_WORD_ANATOMY
\ No newline at end of file
liver weight
\ No newline at end of file
administration/collection site
animal identification
anus
body temperature
bodyweight/growth
breathing
digit/claw
dosing
ear
eye
feces/urine
urine
food consumption
general behaviour
general condition
locomotive behaviour
mouth
normal
nouse
posture
pulmonary parameter
skin/fur
tail
teeth
tongue
unclassified
varia
unspecified
\ No newline at end of file
......@@ -21,8 +21,7 @@ Rule: cdisc_anatomical_location_mapping
gate.FeatureMap features = Factory.newFeatureMap();
features.put("text",content);
features.put("SOURCE","CDISC");
features.put("RULE","DISC_ANATOMICAL_LOCATION_MAPPING");
features.put("study_domain", label);
features.put("RULE","CDISC_ANATOMICAL_LOCATION_MAPPING");
features.putAll(lookupFeatures);
features.remove("majorType");
features.remove("minorType");
......
......@@ -17,7 +17,7 @@ Rule: cdisc_finding_mapping
FeatureMap lookupFeatures = ann.getFeatures();
String label = lookupFeatures.get("LABEL").toString();
String content = stringFor(doc, ann);
if(label.contains("FXFINDRS") || label.contains("NONNEO") || label.contains("NEOPLASM") || label.contains("NEOPLASTIC FINDING TYPE") || label.contains("CSTATE") || label.contains("BODSYS")){
if(label.contains("FXFINDRS") || label.contains("NONNEO") || label.contains("NEOPLASM") || label.contains("CSTATE") ){ // || label.contains("BODSYS")
gate.FeatureMap features = Factory.newFeatureMap();
features.put("text",content);
features.put("SOURCE","CDISC");
......@@ -27,7 +27,22 @@ Rule: cdisc_finding_mapping
features.remove("minorType");
features.remove("INTERNAL_CODE");
try{
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "FINDING", features);
String study_domain = "";
if(label.contains("FXFINDRS")){
study_domain = "FM";
}else if(label.contains("NONNEO")){
study_domain = "MI";
}else if(label.contains("NEOPLASM")){
study_domain = "TF";
}else if(label.contains("CSTATE")){
study_domain = "CL";
}else {
System.out.println("REVISAR label " + label + " for this term x: " + content);
study_domain = "";
}
features.put("CDIS_SEND_DOMAIN_CODE", study_domain);
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "FINDING", features);
}catch(InvalidOffsetException e){
throw new LuckyException(e);
}
......
......@@ -22,7 +22,6 @@ Rule: cdisc_spec_mapping
features.put("text",content);
features.put("SOURCE","CDISC");
features.put("RULE","CDISC_SPEC_RULE");
features.put("study_domain", label);
features.putAll(lookupFeatures);
features.remove("majorType");
features.remove("minorType");
......
......@@ -17,7 +17,7 @@ Rule: cdisc_study_testcd_mapping
FeatureMap lookupFeatures = ann.getFeatures();
String label = lookupFeatures.get("LABEL").toString();
String content = stringFor(doc, ann);
if(label.contains("PKPARMCD_") || label.contains("PKPARM_") || label.endsWith("TEST NAME") || label.endsWith("TEST CODE")){
if(label.contains("PKPARMCD_") || label.contains("PKPARM_") || label.endsWith("TEST CODE") && !label.contains("STSPRM")){
gate.FeatureMap features = Factory.newFeatureMap();
features.put("text",content);
features.put("SOURCE","CDISC");
......@@ -26,46 +26,51 @@ Rule: cdisc_study_testcd_mapping
features.remove("majorType");
features.remove("minorType");
features.remove("INTERNAL_CODE");
if(label.endsWith("TEST CODE")) {
if(!label.contains("STSPRM")){
try{
try{
String study_domain = "";
if(label.contains("BGTEST")){
study_domain = "BG";
}else if(label.contains("BWTEST")){
study_domain = "BW";
}else if(label.contains("DDTEST")){
study_domain = "DD";
}else if(label.contains("EGTEST")){
study_domain = "EG";
}else if(label.contains("FMTEST")){
study_domain = "FM";
}else if(label.contains("FWTESTCD")){
study_domain = "FW";
}else if(label.contains("MATEST")){
study_domain = "MA";
}else if(label.contains("OMTEST")){
study_domain = "OM";
}else if(label.contains("PYTEST")){
study_domain = "PY";
}else if(label.contains("SCVTST")){
study_domain = "CV";
}else if(label.contains("MITEST")){
study_domain = "MI";
}else if(label.contains("SRETST")){
study_domain = "RE";
}else if(label.contains("TFTEST")){
study_domain = "TF";
}else if(label.contains("VSTEST")){
study_domain = "VS";
}else if(label.contains("SVSTST")){
study_domain = "VS";
}else if(label.contains("LBTEST")){
study_domain = "LB";
}else if(label.contains("PKPARM")){
study_domain = "PP";
}else {
System.out.println("REVISAR label " + label + " for this term x: " + content);
study_domain = "";
}
features.put("CDIS_SEND_DOMAIN_CODE", study_domain);
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "STUDY_TESTCD", features);
if(
label.contains("BGTEST") ||
label.contains("BWTEST") ||
label.contains("DDTEST") ||
label.contains("EGTEST") ||
label.contains("FMTEST") ||
label.contains("FXTEST") ||
label.contains("FWTEST") ||
label.contains("BWTEST") ||
label.contains("MATEST") ||
label.contains("OMTEST") ||
label.contains("PYTEST") ||
label.contains("SCVTST") ||
label.contains("MITEST") ||
label.contains("SRETST") ||
label.contains("TFTEST") ||
label.contains("VSTEST")
)
{
try{
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "STUDY_DOMAIN", features);
}catch(InvalidOffsetException e){
throw new LuckyException(e);
}
}
}catch(InvalidOffsetException e){
throw new LuckyException(e);
}
}
}else {
try{
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "STUDY_TESTCD", features);
}catch(InvalidOffsetException e){
throw new LuckyException(e);
}
}
//remove old lookup
inputAS.remove(ann);
}else{
......
Imports: {
import static gate.Utils.*;
}
Phase:secondphase
Input: Lookup
Options: control = appelt
Rule: removeStopwords
(
{Lookup.majorType == "STOP_WORD_ANATOMY"}
) :stop
-->
{
  // Removes every SPECIMEN annotation in outputAS whose span is exactly the
  // span of the matched anatomy stop-word Lookup.
  System.out.println("ENTER RULE STOP ANATOMY");
  gate.AnnotationSet lookup = (gate.AnnotationSet) bindings.get("stop");
  gate.Annotation ann = (gate.Annotation) lookup.iterator().next();
  Long stopStart = ann.getStartNode().getOffset();
  Long stopEnd = ann.getEndNode().getOffset();
  // get(type, start, end) selects SPECIMEN annotations over this span; the
  // exact-span check below then filters them down to identical boundaries.
  gate.AnnotationSet to_remove = outputAS.get("SPECIMEN", stopStart, stopEnd);
  // Iterate over a snapshot so that removing from outputAS cannot invalidate
  // the iteration.
  for (Annotation rem : new java.util.ArrayList<Annotation>(to_remove)) {
    // Offsets are boxed Long values: compare with equals(), because == tests
    // object identity and is not a reliable value comparison.
    if (stopStart.equals(rem.getStartNode().getOffset())
        && stopEnd.equals(rem.getEndNode().getOffset())) {
      System.out.println(rem.getType() + " : " + stringFor(doc, rem));
      outputAS.remove(rem);
    }
  }
}
\ No newline at end of file
......@@ -17,6 +17,7 @@ Rule: etox_ilo_domain_mapping
FeatureMap lookupFeatures = ann.getFeatures();
String label = lookupFeatures.get("LABEL").toString();
String content = stringFor(doc, ann);
if(label.contains("_DOMAIN")){
gate.FeatureMap features = Factory.newFeatureMap();
lookupFeatures.remove("majorType");
......@@ -28,14 +29,13 @@ Rule: etox_ilo_domain_mapping
features.put("RULE","ETOX_ILO_DOMAIN_MAPPING");
features.putAll(lookupFeatures);
try{
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "STUDY_DOMAIN", features);
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "FINDING", features);
}catch(InvalidOffsetException e){
throw new LuckyException(e);
}
//remove old lookup
inputAS.remove(ann);
}else if(label.contains("IN_LIFE_OBSERVATION")){
}else if(label.contains("IN_LIFE_OBSERVATION") | label.contains("ANIMAL_IDENTIFICATION_FINDING") | label.contains("DOSING_FINDING")){
gate.FeatureMap features = Factory.newFeatureMap();
lookupFeatures.remove("majorType");
lookupFeatures.remove("minorType");
......@@ -46,7 +46,7 @@ Rule: etox_ilo_domain_mapping
features.put("RULE","etox_ilo_domain_mapping");
features.putAll(lookupFeatures);
try{
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "FINDING", features);
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), label, features);
}catch(InvalidOffsetException e){
throw new LuckyException(e);
}
......
Imports: {
import static gate.Utils.*;
}
Phase:secondphase
Input: Lookup
Options: control = appelt
Rule: removeStopwords
(
{Lookup.majorType == "STOP_WORD_FINDING"}
) :stop
-->
{
  // Removes every FINDING annotation in outputAS whose span is exactly the
  // span of the matched finding stop-word Lookup.
  System.out.println("ENTER RULE STOP ILO");
  gate.AnnotationSet lookup = (gate.AnnotationSet) bindings.get("stop");
  gate.Annotation ann = (gate.Annotation) lookup.iterator().next();
  Long stopStart = ann.getStartNode().getOffset();
  Long stopEnd = ann.getEndNode().getOffset();
  // get(type, start, end) selects FINDING annotations over this span; the
  // exact-span check below then filters them down to identical boundaries.
  gate.AnnotationSet to_remove = outputAS.get("FINDING", stopStart, stopEnd);
  // Iterate over a snapshot so that removing from outputAS cannot invalidate
  // the iteration.
  for (Annotation rem : new java.util.ArrayList<Annotation>(to_remove)) {
    // Offsets are boxed Long values: compare with equals(), because == tests
    // object identity and is not a reliable value comparison.
    if (stopStart.equals(rem.getStartNode().getOffset())
        && stopEnd.equals(rem.getEndNode().getOffset())) {
      System.out.println(rem.getType() + " : " + stringFor(doc, rem));
      outputAS.remove(rem);
    }
  }
}
\ No newline at end of file
......@@ -38,6 +38,7 @@ Rule: etox_send_mapping
}else if(label.equals("LBTEST") && content.length() > 3){
features.put("ETOX_ORIGINAL_LABEL", label);
try{
features.put("ETOX_SEND_DOMAIN_CODE","LB");
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "STUDY_TESTCD", features);
}catch(InvalidOffsetException e){
throw new LuckyException(e);
......@@ -76,6 +77,7 @@ Rule: etox_send_mapping
} else if(label.equals("PKPARM")){
features.put("ETOX_ORIGINAL_LABEL", label);
try{
features.put("ETOX_SEND_DOMAIN_CODE","PP");
outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(), "STUDY_TESTCD", features);
}catch(InvalidOffsetException e){
throw new LuckyException(e);
......
Imports: {
import static gate.Utils.*;
}
Phase:firstphase
Input: Token
Options: control = appelt
Rule: etox_send_mapping
(
{Token.word=="group"}
)
:lookup
-->
{
  // Creates a GROUP_JAPE annotation over the matched Token, carrying a copy
  // of all of the Token's features.
  gate.AnnotationSet lookup = (gate.AnnotationSet) bindings.get("lookup");
  gate.Annotation ann = (gate.Annotation) lookup.iterator().next();
  FeatureMap lookupFeatures = ann.getFeatures();
  gate.FeatureMap features = Factory.newFeatureMap();
  features.putAll(lookupFeatures);
  try{
    outputAS.add(lookup.firstNode().getOffset(),lookup.lastNode().getOffset(),"GROUP_JAPE", features);
  }catch(InvalidOffsetException e){
    throw new LuckyException(e);
  }
  // GROUP RULES
  // (was "#GROUP RULES": '#' is not a Java comment marker, so that line made
  // this Java block uncompilable)
  // Reference patterns kept from the original notes; they are not JAPE syntax:
  //{ (([{ word:/group[s]*/; tag:/NN.*/}] ([{ner:NUMBER } | {word::IS_NUM} | {word:/^[mdclxvi]+$/}] [{ word:$DOSE_SEP }])* ([{ner:NUMBER } | {word::IS_NUM} | {word:/^[mdclxvi]+$/}]) ) ) => "GROUP" }
  // careful: review the NN tag
  //{ (( [{tag:/JJ.*|NN.*/} ] [{ word:/group[s]*/; tag:/NN.*/}] ) ) => "GROUP" }
}
\ No newline at end of file
......@@ -2,13 +2,13 @@ MultiPhase: Main
Phases:
etox_ilo_trigger_mapping
etox_ilo_domain_mapping
etox_ilo_remove_stop_words
etox_anatomy_mapping
etox_anatomy_remove_stop_words
etox_send_mapping
etox_moa_mapping
cdisc_study_testcd_mapping
cdisc_study_domain_mapping
cdisc_finding_mapping
cdisc_spec_mapping
cdisc_anatomical_location_mapping
cdisc_route_of_administration_mapping
delete_lookups
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment