Commit ae021be0 authored by jcorvi's avatar jcorvi
Browse files

First Commit

parents
#
# Project specific excludes
#
tomcat
#
# Default excludes
#
# Binaries
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
*.war
*.ear
*.sar
*.class
# Maven
target/
# IntelliJ project files
*.iml
*.iws
*.ipr
.idea/
# eclipse project file
.settings/
.classpath
.project
# NetBeans specific
nbproject/private/
build/
nbbuild/
dist/
nbdist/
nbactions.xml
nb-configuration.xml
# OS
.DS_Store
# Misc
*.swp
release.properties
pom.xml.releaseBackup
pom.xml.tag
#custom
pos/
# Image that builds the ades-relation-extraction jar with docker-build.sh.
FROM alpine:3.9
WORKDIR /usr/local/share/ades_relation
# Version handed to docker-build.sh; override with --build-arg ADES_RELATION_VERSION=<x>.
ARG ADES_RELATION_VERSION=1.0
COPY docker-build.sh /usr/local/bin/docker-build.sh
COPY src src
COPY pom.xml .
RUN mkdir logs
# Options go before operands so the command does not depend on getopt argument permutation.
RUN chmod -R u=rwx,g=rwx,o=r /usr/local/share/ades_relation
# COPY keeps the mode bits from the build context, so make the script executable
# explicitly before invoking it through PATH.
RUN chmod +x /usr/local/bin/docker-build.sh \
 && docker-build.sh ${ADES_RELATION_VERSION}
ades-relation-extraction
========================
<b>Relation Extraction for Preclinical Adverse Drug Effect text-mining Project</b>
========================
Internal project. Specific for the preclinical adverse Drug effect text-mining pipeline.
It uses the nlp-gate-generic-component and contains specific JAPE rules for the relation extraction of the treatment-related findings.
This project uses GATE-formatted documents as input and output.
The most important information is that this project is only useful inside the preclinical adverse drug effect pipeline.
The main objective is to extract the relations between the entities and retrieve the treatment-related findings of the toxicology reports.
========================
#!/bin/sh
# Builds the nlp-gate-generic-component jar from the inab/docker-textmining-tools
# repository and installs an `ades-relation-extraction` launcher in /usr/local/bin.
#
# Usage: <this script> [VERSION]
#   VERSION  optional; used in the name of the installed jar (default 1.0).

BASEDIR=/usr/local
ADES_RELATION_EXTRACTION_HOME="${BASEDIR}/share/ades_relation_extraction/"
ADES_RELATION_EXTRACTION_VERSION=1.0

# Exit on error
set -e

# An optional first argument overrides the default version.
if [ $# -ge 1 ] ; then
    ADES_RELATION_EXTRACTION_VERSION="$1"
fi

if [ -f /etc/alpine-release ] ; then
    # Installing OpenJDK 8
    apk add --update openjdk8-jre
    # dict tagger development dependencies
    apk add openjdk8 git maven
else
    # Runtime dependencies
    apt-get update
    # -y: the build is non-interactive, so apt-get must not wait for confirmation.
    apt-get install -y openjdk-8-jre
    # The development dependencies
    apt-get install -y openjdk-8-jdk git maven
fi

# Fetch the tools repository and reduce the checkout to the
# nlp-gate-generic-component subdirectory before building it.
git clone --depth 1 https://github.com/inab/docker-textmining-tools.git nlp_gate_generic_component
cd nlp_gate_generic_component
git filter-branch --prune-empty --subdirectory-filter nlp-gate-generic-component HEAD
mvn clean install -DskipTests
cd ..

#rename jar
# NOTE(review): the jar lands in the current directory while the launcher below
# looks for it under ADES_RELATION_EXTRACTION_HOME - confirm the script is run from there.
mv nlp_gate_generic_component/target/nlp-gate-generic-component-0.0.1-SNAPSHOT-jar-with-dependencies.jar nlp-gate-generic-component-${ADES_RELATION_EXTRACTION_VERSION}.jar

# Unquoted heredoc: the ${...} values are expanded now, at build time, while the
# escaped \$JAVA_OPTS and \$@ stay literal and are expanded when the launcher runs.
cat > /usr/local/bin/ades-relation-extraction <<EOF
#!/bin/sh
exec java \$JAVA_OPTS -jar "${ADES_RELATION_EXTRACTION_HOME}/nlp-gate-generic-component-${ADES_RELATION_EXTRACTION_VERSION}.jar" -workdir "${ADES_RELATION_EXTRACTION_HOME}" -j jape_rules/main.jape "\$@"
EOF
chmod +x /usr/local/bin/ades-relation-extraction

#delete target, do not delete for now because it has the jape rules inside
#rm -R nlp_generic_annotation

if [ -f /etc/alpine-release ] ; then
    #add bash for nextflow
    # apk only exists on Alpine; running it unconditionally aborted the script
    # (set -e) on every other distribution.
    apk add bash
    # Removing not needed tools
    apk del openjdk8 git maven
    rm -rf /var/cache/apk/*
else
    #add bash for nextflow
    apt-get install -y bash
    apt-get remove -y openjdk-8-jdk git maven
    rm -rf /var/cache/dpkg
fi
#!/bin/sh
# Builds the ades-relation-extraction jar with Maven from the pom.xml and src
# found in the current directory, and installs an `ades-relation-extraction`
# launcher in /usr/local/bin.
#
# Usage: docker-build.sh [VERSION]
#   VERSION  optional; used in the name of the installed jar (default 1.0).

BASEDIR=/usr/local
ADES_RELATION_HOME="${BASEDIR}/share/ades_relation/"
ADES_RELATION_VERSION=1.0

# Exit on error
set -e

# An optional first argument overrides the default version.
if [ $# -ge 1 ] ; then
    ADES_RELATION_VERSION="$1"
fi

if [ -f /etc/alpine-release ] ; then
    # Installing OpenJDK 8
    apk add --update openjdk8-jre
    # dict tagger development dependencies
    apk add openjdk8 git maven
else
    # Runtime dependencies
    apt-get update
    # -y: the build is non-interactive, so apt-get must not wait for confirmation.
    apt-get install -y openjdk-8-jre
    # The development dependencies
    apt-get install -y openjdk-8-jdk git maven
fi

mvn clean install -DskipTests

#rename jar
# NOTE(review): the jar is moved into the current directory while the launcher
# below looks for it under ADES_RELATION_HOME - confirm the script is run from there.
mv target/ades-relation-extraction-0.0.1-SNAPSHOT-jar-with-dependencies.jar ades-relation-extraction-${ADES_RELATION_VERSION}.jar

# Unquoted heredoc: the ${...} values are expanded now, at build time, while the
# escaped \$JAVA_OPTS and \$@ stay literal and are expanded when the launcher runs.
cat > /usr/local/bin/ades-relation-extraction <<EOF
#!/bin/sh
exec java \$JAVA_OPTS -jar "${ADES_RELATION_HOME}/ades-relation-extraction-${ADES_RELATION_VERSION}.jar" -workdir "${ADES_RELATION_HOME}" "\$@"
EOF
chmod +x /usr/local/bin/ades-relation-extraction

#delete target
rm -R target src pom.xml

if [ -f /etc/alpine-release ] ; then
    #add bash for nextflow
    # apk only exists on Alpine; running it unconditionally aborted the script
    # (set -e) on every other distribution.
    apk add bash
    # Removing not needed tools
    apk del openjdk8 git maven
    rm -rf /var/cache/apk/*
else
    #add bash for nextflow
    apt-get install -y bash
    apt-get remove -y openjdk-8-jdk git maven
    rm -rf /var/cache/dpkg
fi
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>es.bsc.inb.nlp</groupId>
	<artifactId>ades-relation-extraction</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>ades-relation-extraction</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<maven.compiler.source>1.8</maven.compiler.source>
		<maven.compiler.target>1.8</maven.compiler.target>
	</properties>

	<dependencies>
		<dependency>
			<groupId>uk.ac.gate</groupId>
			<artifactId>gate-core</artifactId>
			<version>8.5.1</version>
		</dependency>
		<!-- NOTE(review): this declares the assembly plugin as a library dependency, so it and
			its transitive dependencies end up in the jar-with-dependencies. It is most likely
			unneeded now that the plugin below is versioned; confirm nothing relies on its
			transitive libraries before removing it. -->
		<dependency>
			<groupId>org.apache.maven.plugins</groupId>
			<artifactId>maven-assembly-plugin</artifactId>
			<version>3.1.0</version>
		</dependency>
		<dependency>
			<groupId>commons-cli</groupId>
			<artifactId>commons-cli</artifactId>
			<version>1.4</version>
		</dependency>
	</dependencies>

	<build>
		<plugins>
			<!-- Builds the executable *-jar-with-dependencies.jar during the package phase. -->
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-assembly-plugin</artifactId>
				<!-- Pinned explicitly: without a version Maven warns and falls back to a default
					chosen by the Maven installation, which makes the build non-reproducible. -->
				<version>3.1.0</version>
				<executions>
					<execution>
						<phase>package</phase>
						<goals>
							<goal>single</goal>
						</goals>
						<configuration>
							<archive>
								<manifest>
									<mainClass>
										es.bsc.inb.ades.relation.extraction.main.App
									</mainClass>
								</manifest>
							</archive>
							<descriptorRefs>
								<descriptorRef>jar-with-dependencies</descriptorRef>
							</descriptorRefs>
						</configuration>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>
</project>
package es.bsc.inb.ades.relation.extraction.main;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import com.fasterxml.jackson.core.JsonGenerationException;
import es.bsc.inb.ades.relation.extraction.model.Finding;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.creole.ResourceInstantiationException;
import gate.relations.RelationSet;
import gate.util.GateException;
import gate.util.InvalidOffsetException;
/**
* ADES Export to JSON.
*
* Export from GATE format to JSON.
*
*
* @author jcorvi
*
*/
public class App {
static final String template_value_name = "value";
/**
 * Command-line entry point.
 *
 * <p>Parses the options {@code -i/--input}, {@code -o/--output},
 * {@code -a/--annotation_set} and {@code -ar/--annotation_set_relation_extraction}
 * (all required) plus the optional {@code -workdir}, creates the output directory
 * if needed, initialises GATE and hands over to
 * {@link #process(String, String, String, String, String)}.
 *
 * <p>The JVM exits with status 1 when the arguments are invalid, the input path is
 * not a directory, the output directory cannot be created, GATE cannot be
 * initialised, or {@code process} throws an {@link IOException}.
 *
 * @param args command-line arguments
 */
public static void main(String[] args ){
    Options options = new Options();

    Option input = new Option("i", "input", true, "input directory path");
    input.setRequired(true);
    options.addOption(input);

    Option output = new Option("o", "output", true, "output directory path");
    output.setRequired(true);
    options.addOption(output);

    Option set = new Option("a", "annotation_set", true, "Annotation set where the annotation will be included");
    set.setRequired(true);
    options.addOption(set);

    Option annotation_set_relation_extraction = new Option("ar", "annotation_set_relation_extraction", true, "Annotation set where the relation extraction will be included");
    annotation_set_relation_extraction.setRequired(true);
    options.addOption(annotation_set_relation_extraction);

    Option workdir = new Option("workdir", "workdir", true, "workDir directory path");
    workdir.setRequired(false);
    options.addOption(workdir);

    CommandLineParser parser = new DefaultParser();
    HelpFormatter formatter = new HelpFormatter();
    CommandLine cmd = null;
    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        System.out.println(e.getMessage());
        // Show the real command name in the usage line instead of a placeholder.
        formatter.printHelp("ades-relation-extraction", options);
        System.exit(1);
    }

    String inputFilePath = cmd.getOptionValue("input");
    String outputFilePath = cmd.getOptionValue("output");
    String workdirPath = cmd.getOptionValue("workdir");
    String annotationSet = cmd.getOptionValue("annotation_set");
    String annotationSetRelationExtraction = cmd.getOptionValue("annotation_set_relation_extraction");

    if (!java.nio.file.Files.isDirectory(Paths.get(inputFilePath))) {
        System.out.println("Please set the inputDirectoryPath ");
        System.exit(1);
    }
    if (annotationSet==null) {
        System.out.println("Please set the annotation set where the annotation will be included");
        System.exit(1);
    }
    if (annotationSetRelationExtraction==null) {
        System.out.println("Please set the annotation relation extraction output set where the relations will be included");
        System.exit(1);
    }

    File outputDirectory = new File(outputFilePath);
    // mkdirs() reports failure through its return value only; stop here instead of
    // failing later once per document when the output files cannot be written.
    if (!outputDirectory.exists() && !outputDirectory.mkdirs()) {
        System.out.println("App::main :: could not create output directory " + outputFilePath);
        System.exit(1);
    }

    try {
        Gate.init();
    } catch (GateException e) {
        System.out.println("App::main :: Gate Exception ");
        e.printStackTrace();
        System.exit(1);
    }

    // The work directory is optional; an empty string stands for "not set".
    if(workdirPath==null) {
        workdirPath="";
    }

    try {
        process(inputFilePath, outputFilePath,workdirPath, annotationSet, annotationSetRelationExtraction);
    } catch (IOException e) {
        e.printStackTrace();
        // Report the failure to the caller (shell / pipeline) through the exit status.
        System.exit(1);
    }
}
/**
 * Runs the per-document processing on every file of the input directory whose
 * name ends with {@code .xml}. Each result is written to a file with the same
 * name inside the output directory.
 *
 * <p>A failure on one document is logged and does not stop the remaining
 * documents from being processed. If {@code inputDirectoryPath} is not a
 * directory, a message is printed and nothing is processed.
 *
 * @param inputDirectoryPath directory containing the input documents
 * @param outputDirectoryPath directory receiving the processed documents
 * @param workdir work directory; not used by this method
 * @param annotationSet name of the annotation set passed to the document processing
 * @param annotationSetRelationExtraction name of the annotation set passed to the
 *        document processing for the extracted relations
 * @throws IOException if the content of the input directory cannot be listed
 */
public static void process(String inputDirectoryPath, String outputDirectoryPath, String workdir, String annotationSet, String annotationSetRelationExtraction) throws IOException {
    System.out.println("App::processTagger :: INIT ");
    if (java.nio.file.Files.isDirectory(Paths.get(inputDirectoryPath))) {
        File inputDirectory = new File(inputDirectoryPath);
        File[] files = inputDirectory.listFiles();
        // listFiles() returns null (rather than throwing) when the directory cannot
        // be read; fail with a clear message instead of a NullPointerException.
        if (files == null) {
            throw new IOException("Cannot list the files of directory : " + inputDirectoryPath);
        }
        for (File file : files) {
            if(file.getName().endsWith(".xml")){
                try {
                    System.out.println("App::process :: processing file : " + file.getAbsolutePath());
                    String fileOutPutName = file.getName();
                    File outputGATEFile = new File (outputDirectoryPath + File.separator + fileOutPutName);
                    processDocumentandaWithStudyAsFinding(file, outputGATEFile, annotationSet, annotationSetRelationExtraction);
                } catch (Exception e) {
                    // Every failure is handled the same way: log it and carry on
                    // with the next document.
                    System.out.println("App::process :: error with document " + file.getAbsolutePath());
                    e.printStackTrace();
                }
            }
        }
    }else {
        System.out.println("No directory : " + inputDirectoryPath);
    }
    System.out.println("App::process :: END ");
}
/**
* Execute process in a document
* @param inputFile
* @param outputGATEFile
* @throws ResourceInstantiationException
* @throws IOException
* @throws JsonGenerationException
* @throws InvalidOffsetException
*/
private static void processDocument(File inputFile, File outputGATEFile, String annotationSet, String annotationSetRelationExtraction) throws ResourceInstantiationException, JsonGenerationException, IOException{
gate.Document doc = Factory.newDocument(inputFile.toURI().toURL(), "UTF-8");
AnnotationSet as = doc.getAnnotations(annotationSet);
AnnotationSet findings = (AnnotationSet) as.get("FINDING");
System.out.println("**********************************************rule_finding_init************************************************************");
RelationSet relSet = as.getRelations();
List<Annotation> findings_to_process = new ArrayList<>();
for (Annotation finding : findings.inDocumentOrder()){
/*if(finding.getFeatures().get("text").toString().startsWith("Pale")) {
System.out.println("Smaller term do nothing ");
}*/
AnnotationSet findings_to_merge = as.get("FINDING", finding.getStartNode().getOffset(), finding.getEndNode().getOffset());
if(!findings_to_merge.isEmpty() && findings_to_merge.size()>1) {
int i = 1;
for (Annotation finding_to_merge : findings_to_merge) {
//if there is another finding present then ...
if(!finding_to_merge.getId().equals(finding.getId())){
if(finding.getEndNode().getOffset()-finding.getStartNode().getOffset() <
finding_to_merge.getEndNode().getOffset()-finding_to_merge.getStartNode().getOffset()){
System.out.println("Smaller term do nothing ");
}else if (finding.getEndNode().getOffset()-finding.getStartNode().getOffset() >=
finding_to_merge.getEndNode().getOffset()-finding_to_merge.getStartNode().getOffset()){
//insert internal term with the annotation. This could be used in the future.
finding.getFeatures().put("internal_term_"+i, finding_to_merge);
i=i+1;
//plus add send code or relevant internal information to the biggest annotation
if(finding_to_merge.getFeatures().get("CDISC_SEND_CODE")!=null) {
finding.getFeatures().put("CDISC_SEND_CODE",finding_to_merge.getFeatures().get("CDISC_SEND_CODE").toString());
finding.getFeatures().put("CDISC_CODELIST",finding_to_merge.getFeatures().get("CDISC_CODELIST").toString());
}
/*else if(finding_to_merge.getFeatures().get("ETOX_SEND_CODE")!=null) {
finding.getFeatures().put("ETOX_SEND_CODE",finding_to_merge.getFeatures().get("ETOX_SEND_CODE").toString());
}else if(finding_to_merge.getFeatures().get("ETOX_SEND_DOMAIN_CODE")!=null) {
finding.getFeatures().put("ETOX_SEND_DOMAIN_CODE",finding_to_merge.getFeatures().get("ETOX_SEND_DOMAIN_CODE").toString());
}else if(finding_to_merge.getFeatures().get("MANUAL_SEND_CODE")!=null) {
finding.getFeatures().put("MANUAL_SEND_CODE",finding_to_merge.getFeatures().get("MANUAL_SEND_CODE").toString());
}else if(finding_to_merge.getFeatures().get("LIMTOX_HEPATOTOXICITY")!=null) {//limtox hepatotoxicity text missing
finding.getFeatures().put("LIMTOX_HEPATOTOXICITY",finding_to_merge.getFeatures().get("LIMTOX_HEPATOTOXICITY").toString());
} else {
System.out.println(finding_to_merge);
}*/
//no merge is needed
findings_to_process.add(finding);
}
}
}
}else {
findings_to_process.add(finding);
}
}
//for (Annotation finding : findings.inDocumentOrder()){
Integer finding_id = 1;
for (Annotation finding : findings_to_process){
String str_finding = gate.Utils.stringFor(doc, finding);
AnnotationSet sentences = as.get("Sentence", finding.getStartNode().getOffset(), finding.getEndNode().getOffset());
System.out.println("FINDING: " + str_finding);
for (Annotation sentence : sentences){
String str_sentence = gate.Utils.stringFor(doc, sentence);
System.out.println("Sentencia:");
System.out.println(str_sentence);
finding.getFeatures().put("ANNOTATION_TYPE","FINDING");
finding.getFeatures().put(template_value_name, getSendCodeFinding(finding, gate.Utils.stringFor(doc, finding)));
doc.getAnnotations(annotationSetRelationExtraction).add(finding.getStartNode(), finding.getEndNode(), "FINDING_"+finding_id, finding.getFeatures());
AnnotationSet sentenceFields = as.get(sentence.getStartNode().getOffset(), sentence.getEndNode().getOffset());
sentence.getFeatures().put("ANNOTATION_TYPE", "RELEVANT_TEXT");
doc.getAnnotations(annotationSetRelationExtraction).add(sentence.getStartNode(), sentence.getEndNode(), "FINDING_"+finding_id, sentence.getFeatures());
Annotation ann = isTreatmentRelatedFinding(doc, finding, sentenceFields);
if(ann!=null) {
System.out.println("TREATMENT_RELATED: " + gate.Utils.stringFor(doc, ann));
ann.getFeatures().put("ANNOTATION_TYPE", "IS_TREATMENT_RELATED");
doc.getAnnotations(annotationSetRelationExtraction).add(ann.getStartNode(), ann.getEndNode(), "FINDING_"+finding_id, ann.getFeatures());
}else {
FeatureMap features_uncertain = Factory.newFeatureMap();
features_uncertain.put(template_value_name, "U");
features_uncertain.put("ANNOTATION_TYPE", "IS_TREATMENT_RELATED");
doc.getAnnotations(annotationSetRelationExtraction).add(sentence.getStartNode(), sentence.getEndNode(), "FINDING_"+finding_id, features_uncertain);
}
Annotation manifestation_finding = getClosestAnnotation(doc, sentenceFields, finding, "MANIFESTATION_FINDING");
if(manifestation_finding!=null) {
System.out.println("MANIFESTATION OF FINDING: " + gate.Utils.stringFor(doc, manifestation_finding));
manifestation_finding.getFeatures().put(template_value_name, getSendCode(manifestation_finding, gate.Utils.stringFor(doc, manifestation_finding)));
manifestation_finding.getFeatures().put("ANNOTATION_TYPE",manifestation_finding.getType());
doc.getAnnotations(annotationSetRelationExtraction).add(manifestation_finding.getStartNode(), manifestation_finding.getEndNode(), "FINDING_"+finding_id, manifestation_finding.getFeatures());
}
/*else {
FeatureMap features_uncertain = Factory.newFeatureMap();
features_uncertain.put(template_value_name, "P");
features_uncertain.put("ANNOTATION_TYPE", "MANIFESTATION_FINDING");
doc.getAnnotations(annotationSet).add(manifestation_finding.getStartNode(), manifestation_finding.getEndNode(), "FINDING_"+finding_id, manifestation_finding.getFeatures());
}*/
Annotation specimen = getClosestAnnotation(doc, sentenceFields, finding, "SPECIMEN");
if(specimen!=null) {
System.out.println("SPECIMEN: " + gate.Utils.stringFor(doc, specimen));
specimen.getFeatures().put(template_value_name, getSendCode(specimen, gate.Utils.stringFor(doc, specimen)));
specimen.getFeatures().put("ANNOTATION_TYPE",specimen.getType());
doc.getAnnotations(annotationSetRelationExtraction).add(specimen.getStartNode(), specimen.getEndNode(), "FINDING_"+finding_id, specimen.getFeatures());
}
Annotation STUDY_TESTCD = getClosestAnnotation(doc, sentenceFields, finding, "STUDY_TESTCD");
if(STUDY_TESTCD!=null) {
System.out.println("STUDY_TESTCD: " + gate.Utils.stringFor(doc, STUDY_TESTCD));
STUDY_TESTCD.getFeatures().put("ANNOTATION_TYPE",STUDY_TESTCD.getType());
STUDY_TESTCD.getFeatures().put(template_value_name, getSendCode(STUDY_TESTCD, gate.Utils.stringFor(doc, STUDY_TESTCD)));
doc.getAnnotations(annotationSetRelationExtraction).add(STUDY_TESTCD.getStartNode(), STUDY_TESTCD.getEndNode(), "FINDING_"+finding_id, STUDY_TESTCD.getFeatures());
}
Annotation STUDY_DOMAIN = getClosestAnnotation(doc, sentenceFields, finding, "STUDY_DOMAIN");
if(STUDY_DOMAIN!=null) {
System.out.println("STUDY_DOMAIN: " + gate.Utils.stringFor(doc, STUDY_DOMAIN));
STUDY_DOMAIN.getFeatures().put("ANNOTATION_TYPE",STUDY_DOMAIN.getType());
STUDY_DOMAIN.getFeatures().put(template_value_name, getSendCode(STUDY_DOMAIN, gate.Utils.stringFor(doc, STUDY_DOMAIN)));
doc.getAnnotations(annotationSetRelationExtraction).add(STUDY_DOMAIN.getStartNode(), STUDY_DOMAIN.getEndNode(), "FINDING_"+finding_id, STUDY_DOMAIN.getFeatures());
}
Annotation risk_level = getClosestAnnotation(doc, sentenceFields, finding, "RISK_LEVEL");
if(risk_level!=null) {
System.out.println("RISK_LEVEL: " + gate.Utils.stringFor(doc, risk_level));
//risk_level.getFeatures().put(template_value_name, getSendCode(risk_level, gate.Utils.stringFor(doc, risk_level)));
risk_level.getFeatures().put(template_value_name, gate.Utils.stringFor(doc, risk_level));
risk_level.getFeatures().put("ANNOTATION_TYPE",risk_level.getType());
doc.getAnnotations(annotationSetRelationExtraction).add(risk_level.getStartNode(), risk_level.getEndNode(), "FINDING_"+finding_id, risk_level.getFeatures());
}
Annotation DOSE_QUANTITY = getClosestAnnotation(doc, sentenceFields, finding, "DOSE_QUANTITY");
if(DOSE_QUANTITY!=null) {
System.out.println("DOSE_QUANTITY: " + gate.Utils.stringFor(doc, DOSE_QUANTITY));
DOSE_QUANTITY.getFeatures().put("ANNOTATION_TYPE",DOSE_QUANTITY.getType());
DOSE_QUANTITY.getFeatures().put(template_value_name, gate.Utils.stringFor(doc, DOSE_QUANTITY));
doc.getAnnotations(annotationSetRelationExtraction).add(DOSE_QUANTITY.getStartNode(), DOSE_QUANTITY.getEndNode(), "FINDING_"+finding_id, DOSE_QUANTITY.getFeatures());