Commit 6b31b7d5 authored by jcorvi's avatar jcorvi
Browse files

Initial commit

parents
#
# Project specific excludes
#
tomcat
#
# Default excludes
#
# Binaries
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
*.war
*.ear
*.sar
*.class
# Maven
target/
# IntelliJ project files
*.iml
*.iws
*.ipr
.idea/
# eclipse project file
.settings/
.classpath
.project
# NetBeans specific
nbproject/private/
build/
nbbuild/
dist/
nbdist/
nbactions.xml
nb-configuration.xml
# OS
.DS_Store
# Misc
*.swp
release.properties
pom.xml.releaseBackup
pom.xml.tag
#custom
pos/
# Image that builds and packages the ades-export-to-json command line tool.
FROM alpine:3.9
# Directory that receives the sources; the build script is run from here.
WORKDIR /usr/local/share/adesexporttojson
# Version handed to docker-build.sh (override with --build-arg).
ARG ADES_EXPORT_JSON_VERSION=1.0
# The build script is placed in /usr/local/bin so the RUN step below can invoke it by name.
COPY docker-build.sh /usr/local/bin/docker-build.sh
# Maven sources and project descriptor, copied relative to WORKDIR.
COPY src src
COPY pom.xml .
# Recursively grant rwx to owner and group and read-only access to others on the work directory.
RUN chmod u=rwx,g=rwx,o=r /usr/local/share/adesexporttojson -R
# Run the build script with the requested version as its single argument.
RUN docker-build.sh ${ADES_EXPORT_JSON_VERSION}
# ades-export-to-json
This component exports the treatment-related findings annotated in GATE XML format to JSON format.
## Description
This component is only used inside the treatment-related findings pipeline because it is tied to that specific domain.
## To clone this component
git clone --depth 1 https://github.com/inab/docker-textmining-tools.git ades-export-to-json
cd ades-export-to-json
git filter-branch --prune-empty --subdirectory-filter ades-export-to-json HEAD
## Build and Run the Docker
# To build the docker, just go into the ades-export-to-json folder and execute
docker build -t ades-export-to-json .
#To run the docker, just set the input_folder and the output
mkdir ${PWD}/output_annotation; docker run --rm -u $UID -v ${PWD}/input_folder:/in:ro -v ${PWD}/output_annotation:/out:rw ades-export-to-json ades-export-to-json -i /in -o /out -a MY_SET_NAME -ar MY_RELATION_SET_NAME
Parameters:
<p>
-i input folder with the documents to export. Only GATE XML documents (files ending in .xml) are processed.
</p>
<p>
-o output folder where the exported JSON documents are written (one .json file per input document).
</p>
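<p>
-a annotation set that contains the entity annotations to export
</p>
<p>
-ar annotation set that contains the relation extraction (findings) annotations to export
</p>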
## Built With
* [Docker](https://www.docker.com/) - Docker Containers
* [Maven](https://maven.apache.org/) - Dependency Management
* [GATE](https://gate.ac.uk/overview.html) - GATE: a full-lifecycle open source solution for text processing
## Versioning
We use [SemVer](http://semver.org/) for versioning. For the versions available, see the [tags on this repository](https://github.com/inab/docker-textmining-tools/edit/master/nlp-standard-preprocessing/tags).
## Authors
* **Javier Corvi**
## License
This project is licensed under the GNU GENERAL PUBLIC LICENSE Version 3 - see the [LICENSE.md](LICENSE.md) file for details
#!/bin/sh
# Builds ades-export-to-json with Maven inside the image, installs the
# resulting fat jar plus a launcher script, and removes the build-only tools.
#
# Usage: docker-build.sh [VERSION]
#   VERSION  optional; used in the name of the installed jar (default 1.0).
#
# Must be run from the directory that contains pom.xml and src/.

BASEDIR=/usr/local
ADES_EXPORT_JSON_HOME="${BASEDIR}/share/adesexporttojson/"
ADES_EXPORT_JSON_VERSION=1.0

# Exit on error
set -e

if [ $# -ge 1 ] ; then
	ADES_EXPORT_JSON_VERSION="$1"
fi

if [ -f /etc/alpine-release ] ; then
	# Installing OpenJDK 8
	apk add --update openjdk8-jre
	# ades development dependencies
	apk add openjdk8 git maven
else
	# -y is required: without it apt-get asks for confirmation and aborts
	# when there is no terminal attached (as in a docker build).
	# Runtime dependencies
	apt-get update
	apt-get install -y openjdk-8-jre
	# The development dependencies
	apt-get install -y openjdk-8-jdk git maven
fi

mvn clean install -DskipTests

# Rename the assembled jar so that its name carries the requested version.
mv target/ades-export-to-json-0.0.1-SNAPSHOT-jar-with-dependencies.jar "ades-export-to-json-${ADES_EXPORT_JSON_VERSION}.jar"

# Launcher script. Unescaped variables are expanded now (build time);
# \$JAVA_OPTS and \$@ are escaped so they are expanded when the launcher runs.
cat > /usr/local/bin/ades-export-to-json <<EOF
#!/bin/sh
exec java \$JAVA_OPTS -jar "${ADES_EXPORT_JSON_HOME}/ades-export-to-json-${ADES_EXPORT_JSON_VERSION}.jar" -workdir "${ADES_EXPORT_JSON_HOME}" "\$@"
EOF
chmod +x /usr/local/bin/ades-export-to-json

# Delete the build output and the sources; only the renamed jar is kept.
rm -R target src pom.xml

if [ -f /etc/alpine-release ] ; then
	# add bash for nextflow (apk only exists on Alpine, so this must stay
	# inside the Alpine branch or 'set -e' aborts the script elsewhere)
	apk add bash
	# Removing not needed tools
	apk del openjdk8 git maven
	rm -rf /var/cache/apk/*
else
	# add bash for nextflow
	apt-get install -y bash
	apt-get remove -y openjdk-8-jdk git maven
	rm -rf /var/cache/dpkg
fi
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>es.bsc.inb.nlp</groupId>
	<artifactId>ades-export-to-json</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>ades_tagger</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<maven.compiler.source>1.8</maven.compiler.source>
		<maven.compiler.target>1.8</maven.compiler.target>
	</properties>

	<!--
		Libraries used by the application code.
		maven-assembly-plugin is a build plugin, not a library: it is configured
		(with its version) under <build><plugins> below and must not be listed
		here, otherwise it and its transitive Maven artifacts are bundled into
		the jar-with-dependencies.
	-->
	<dependencies>
		<dependency>
			<groupId>uk.ac.gate</groupId>
			<artifactId>gate-core</artifactId>
			<version>8.5.1</version>
		</dependency>
		<dependency>
			<groupId>com.google.code.gson</groupId>
			<artifactId>gson</artifactId>
			<version>2.8.0</version>
		</dependency>
		<dependency>
			<groupId>commons-cli</groupId>
			<artifactId>commons-cli</artifactId>
			<version>1.4</version>
		</dependency>
	</dependencies>

	<build>
		<plugins>
			<!-- Builds the executable *-jar-with-dependencies.jar during the package phase. -->
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-assembly-plugin</artifactId>
				<!-- Version pinned so that the build is reproducible. -->
				<version>3.1.0</version>
				<executions>
					<execution>
						<phase>package</phase>
						<goals>
							<goal>single</goal>
						</goals>
						<configuration>
							<archive>
								<manifest>
									<mainClass>
										es.bsc.inb.ades.export.json.main.App
									</mainClass>
								</manifest>
							</archive>
							<descriptorRefs>
								<descriptorRef>jar-with-dependencies</descriptorRef>
							</descriptorRefs>
						</configuration>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>
</project>
\ No newline at end of file
package es.bsc.inb.ades.export.json.main;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import com.fasterxml.jackson.core.JsonGenerationException;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.Gate;
import gate.creole.ResourceInstantiationException;
import gate.util.GateException;
import gate.util.InvalidOffsetException;
/**
 * ADES Export to JSON.
 *
 * Command line tool that converts every GATE XML document (<code>*.xml</code>)
 * found in an input directory into a JSON file with the same base name in an
 * output directory. Each JSON document contains the plain text, the raw XML,
 * the entity annotations of one annotation set grouped by type, and the
 * findings read from a second (relation extraction) annotation set.
 *
 * @author jcorvi
 *
 */
public class App {

	/** Name of the annotation feature that is additionally copied to the top level of a finding element. */
	static final String template_value_name = "value";

	/** Extension that selects the input files and that is replaced by ".json" in the output name. */
	private static final String XML_EXTENSION = ".xml";

	/**
	 * Command line entry point.
	 *
	 * Required options: -i/--input, -o/--output, -a/--annotation_set and
	 * -ar/--annotation_set_relation_extraction. Optional: -workdir.
	 * The JVM exits with status 1 on invalid arguments, when the output
	 * directory cannot be created, when GATE cannot be initialised or when
	 * the input directory cannot be processed.
	 *
	 * @param args command line arguments
	 */
	public static void main(String[] args ){
		Options options = new Options();

		Option input = new Option("i", "input", true, "input directory path");
		input.setRequired(true);
		options.addOption(input);

		Option output = new Option("o", "output", true, "output directory path");
		output.setRequired(true);
		options.addOption(output);

		Option set = new Option("a", "annotation_set", true, "Annotation set where the annotation will be included");
		set.setRequired(true);
		options.addOption(set);

		Option annotation_set_relation_extraction = new Option("ar", "annotation_set_relation_extraction", true, "Annotation set where the relation extraction will be included");
		annotation_set_relation_extraction.setRequired(true);
		options.addOption(annotation_set_relation_extraction);

		Option workdir = new Option("workdir", "workdir", true, "workDir directory path");
		workdir.setRequired(false);
		options.addOption(workdir);

		CommandLineParser parser = new DefaultParser();
		HelpFormatter formatter = new HelpFormatter();
		CommandLine cmd = null;
		try {
			cmd = parser.parse(options, args);
		} catch (ParseException e) {
			System.out.println(e.getMessage());
			formatter.printHelp("utility-name", options);
			System.exit(1);
		}

		String inputFilePath = cmd.getOptionValue("input");
		String outputFilePath = cmd.getOptionValue("output");
		String workdirPath = cmd.getOptionValue("workdir");
		String annotationSet = cmd.getOptionValue("annotation_set");
		String annotationSetRelationExtraction = cmd.getOptionValue("annotation_set_relation_extraction");

		if (!Files.isDirectory(Paths.get(inputFilePath))) {
			System.out.println("Please set the inputDirectoryPath ");
			System.exit(1);
		}
		if (annotationSet==null) {
			System.out.println("Please set the annotation set where the annotation will be included");
			System.exit(1);
		}
		if (annotationSetRelationExtraction==null) {
			System.out.println("Please set the annotation relation extraction output set where the relations will be included");
			System.exit(1);
		}

		// Fail early instead of failing once per document when the output
		// directory is missing and cannot be created.
		File outputDirectory = new File(outputFilePath);
		if (!outputDirectory.exists() && !outputDirectory.mkdirs()) {
			System.out.println("Could not create the output directory " + outputFilePath);
			System.exit(1);
		}

		try {
			Gate.init();
		} catch (GateException e) {
			System.out.println("App::main :: Gate Exception ");
			e.printStackTrace();
			System.exit(1);
		}

		if(workdirPath==null) {
			workdirPath="";
		}

		try {
			process(inputFilePath, outputFilePath,workdirPath, annotationSet, annotationSetRelationExtraction);
		} catch (IOException e) {
			e.printStackTrace();
			// Report the failure to the caller instead of exiting with status 0.
			System.exit(1);
		}
	}

	/**
	 * Process directory and convert XML GATE format to JSON.
	 *
	 * Every regular entry of the input directory whose name ends with ".xml"
	 * is converted; a failure on one document is logged and does not stop the
	 * processing of the remaining ones.
	 *
	 * @param inputDirectoryPath directory with the GATE XML documents
	 * @param outputDirectoryPath directory where the JSON files are written
	 * @param workdir working directory; currently not used by this method
	 * @param annotationSet name of the annotation set with the entity annotations
	 * @param annotationSetRelationExtraction name of the annotation set with the findings
	 * @throws IOException if the content of the input directory cannot be listed
	 */
	public static void process(String inputDirectoryPath, String outputDirectoryPath, String workdir, String annotationSet, String annotationSetRelationExtraction) throws IOException {
		System.out.println("App::processTagger :: INIT ");
		if (Files.isDirectory(Paths.get(inputDirectoryPath))) {
			File[] files = new File(inputDirectoryPath).listFiles();
			if (files == null) {
				// listFiles() returns null on an I/O error (e.g. missing permissions).
				throw new IOException("Unable to list directory : " + inputDirectoryPath);
			}
			for (File file : files) {
				String fileName = file.getName();
				if (!fileName.endsWith(XML_EXTENSION)) {
					continue;
				}
				try {
					System.out.println("App::process :: processing file : " + file.getAbsolutePath());
					// Replace only the trailing extension; String.replace would also
					// rewrite ".xml" occurring in the middle of the name.
					String fileOutPutName = fileName.substring(0, fileName.length() - XML_EXTENSION.length()) + ".json";
					File outputGATEFile = new File(outputDirectoryPath + File.separator + fileOutPutName);
					processDocument(file, outputGATEFile, annotationSet, annotationSetRelationExtraction);
				} catch (Exception e) {
					// Per-document boundary: log and continue with the next file.
					System.out.println("App::process :: error with document " + file.getAbsolutePath());
					e.printStackTrace();
				}
			}
		}else {
			System.out.println("No directory : " + inputDirectoryPath);
		}
		System.out.println("App::process :: END ");
	}

	/**
	 * Execute process in a document: load it with GATE, build its JSON
	 * representation and write it, UTF-8 encoded, to the output file.
	 *
	 * @param inputFile GATE XML document to read
	 * @param outputGATEFile JSON file to write (overwritten if it exists)
	 * @param annotationSet name of the annotation set with the entity annotations
	 * @param annotationSetRelationExtraction name of the annotation set with the findings
	 * @throws ResourceInstantiationException if GATE cannot load the document
	 * @throws IOException if the input cannot be read or the output cannot be written
	 * @throws JsonGenerationException
	 * @throws InvalidOffsetException if the document content cannot be extracted
	 */
	private static void processDocument(File inputFile, File outputGATEFile, String annotationSet, String annotationSetRelationExtraction ) throws ResourceInstantiationException, JsonGenerationException, IOException, InvalidOffsetException{
		gate.Document doc = Factory.newDocument(inputFile.toURI().toURL(), "UTF-8");
		try {
			Gson gsonBuilder = new GsonBuilder().create();
			JsonObject document = new JsonObject();
			// Keeps the document name up to and including its first ".xml".
			// NOTE(review): presumably GATE appends a suffix to the file name - confirm.
			document.addProperty("name", doc.getName().substring(0, doc.getName().indexOf(".xml")+4));
			String plainText = doc.getContent().getContent(0l, gate.Utils.lengthLong(doc)).toString();
			document.addProperty("text", plainText);
			document.addProperty("textWithAnnotations", new String(Files.readAllBytes(inputFile.getAbsoluteFile().toPath()), StandardCharsets.UTF_8));
			document.addProperty("id", System.currentTimeMillis());
			document.add("annotations", buildEntities(doc, annotationSet));
			document.add("findings", buildFindings(doc, annotationSetRelationExtraction));

			// Explicit UTF-8 (JSON text must not depend on the platform charset);
			// try-with-resources closes the file even if writing fails.
			try (java.io.Writer out = new java.io.BufferedWriter(new java.io.OutputStreamWriter(new FileOutputStream(outputGATEFile, false), StandardCharsets.UTF_8))) {
				out.write(gsonBuilder.toJson(document));
			}
		} finally {
			// Release the document, otherwise GATE keeps every loaded document
			// registered and memory grows with the number of files processed.
			Factory.deleteResource(doc);
		}
	}

	/**
	 * Builds the "annotations" object: one array per entity type, holding the
	 * annotations of that type in document order.
	 */
	private static JsonObject buildEntities(gate.Document doc, String annotationSet) {
		Set<String> types = Stream.of("FINDING","SEX","SPECIMEN","GROUP","DOSE","DOSE_DURATION","DOSE_QUANTITY","DOSE_FREQUENCY","MANIFESTATION_FINDING","RISK_LEVEL","NO_TREATMENT_RELATED_TRIGGER",
				"TREATMENT_RELATED_TRIGGER","STUDY_DOMAIN","STUDY_DAY_FINDING","STUDY_TESTCD", "ROUTE_OF_ADMINISTRATION","MODE_OF_ACTION","STATISTICAL_SIGNIFICANCE","CYPS").collect(Collectors.toCollection(HashSet::new));
		JsonObject entities = new JsonObject();
		AnnotationSet as = doc.getAnnotations(annotationSet).get(types);
		for (String type : as.getAllTypes()) {
			JsonArray type_array = new JsonArray();
			for (Annotation annotation : as.get(type).inDocumentOrder()) {
				JsonObject annotationObject = new JsonObject();
				annotationObject.addProperty("type", annotation.getType());
				annotationObject.addProperty("text", gate.Utils.stringFor(doc, annotation));
				annotationObject.addProperty("startOffset", annotation.getStartNode().getOffset());
				annotationObject.addProperty("endOffset", annotation.getEndNode().getOffset());
				annotationObject.add("features", toFeatureArray(annotation.getFeatures()));
				type_array.add(annotationObject);
			}
			entities.add(type, type_array);
		}
		return entities;
	}

	/**
	 * Builds the "findings" array. Every annotation type of the relation
	 * extraction set is one finding (numbered from 1 in number-aware name
	 * order); inside a finding the annotations are grouped by their
	 * ANNOTATION_TYPE feature and the first annotation of each group, in
	 * document order, is exported under that key.
	 */
	private static JsonArray buildFindings(gate.Document doc, String annotationSetRelationExtraction) {
		JsonArray findings = new JsonArray();
		AnnotationSet as2 = doc.getAnnotations(annotationSetRelationExtraction);
		int id = 0;
		for (String finding : sortFindings(as2.getAllTypes())) {
			JsonObject findingObject = new JsonObject();
			id = id +1;
			findingObject.addProperty("id", id);
			Map<String, List<Annotation>> annotations_findings_by_type = new HashMap<String, List<Annotation>>();
			for (Annotation findingElement : as2.get(finding).inDocumentOrder()) {
				Object annotationType = findingElement.getFeatures().get("ANNOTATION_TYPE");
				if(annotationType!=null) {
					// Always key by the string form: the map is keyed by String, so looking
					// it up with the raw feature object only works if that object is a String.
					annotations_findings_by_type.computeIfAbsent(annotationType.toString(), k -> new ArrayList<Annotation>()).add(findingElement);
				}else {
					System.out.print("No tiene annotation type: " + findingElement);
				}
			}
			for (Map.Entry<String, List<Annotation>> entry : annotations_findings_by_type.entrySet()) {
				Annotation annotation_by_type = entry.getValue().get(0);
				JsonObject findingElementObject = new JsonObject();
				findingElementObject.addProperty("text", gate.Utils.stringFor(doc, annotation_by_type));
				Object templateValue = annotation_by_type.getFeatures().get(template_value_name);
				if(templateValue!=null) {
					findingElementObject.addProperty(template_value_name, templateValue.toString());
				}
				findingElementObject.addProperty("startOffset", annotation_by_type.getStartNode().getOffset());
				findingElementObject.addProperty("endOffset", annotation_by_type.getEndNode().getOffset());
				findingElementObject.add("features", toFeatureArray(annotation_by_type.getFeatures()));
				findingObject.add(entry.getKey(), findingElementObject);
			}
			findings.add(findingObject);
		}
		return findings;
	}

	/**
	 * Converts a feature map into a JSON array of {"name", "value"} objects.
	 * A feature whose value is null is passed to Gson as a null string instead
	 * of aborting the whole document with a NullPointerException.
	 */
	private static JsonArray toFeatureArray(Map<Object, Object> featureMap) {
		JsonArray features = new JsonArray();
		for (Map.Entry<Object, Object> entry : featureMap.entrySet()) {
			Object value = entry.getValue();
			String valueText = value == null ? null : value.toString();
			JsonObject feature = new JsonObject();
			feature.addProperty("name", entry.getKey().toString());
			feature.addProperty("value", valueText);
			features.add(feature);
		}
		return features;
	}

	/**
	 * Returns the given names sorted with {@link NumberAwareStringComparator},
	 * i.e. digit runs are compared by numeric value.
	 */
	private static List<String> sortFindings(Set<String> allTypes) {
		List<String> mainList = new ArrayList<String>(allTypes);
		Collections.sort(mainList, NumberAwareStringComparator.INSTANCE);
		return mainList;
	}
}
package es.bsc.inb.ades.export.json.main;
import java.math.BigInteger;
import java.util.Comparator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Orders character sequences so that runs of digits are compared by their
 * numeric value rather than character by character (e.g. "a2" sorts before
 * "a10"). Each input is read as a series of (non-digits, digits) chunks that
 * are compared pairwise from left to right.
 */
public class NumberAwareStringComparator implements Comparator<CharSequence> {

    /** Shared instance. */
    public static final NumberAwareStringComparator INSTANCE =
            new NumberAwareStringComparator();

    /** One chunk: a possibly empty non-digit run followed by a possibly empty digit run. */
    private static final Pattern PATTERN = Pattern.compile("(\\D*)(\\d*)");

    NumberAwareStringComparator() {
    }

    /**
     * Compares two sequences chunk by chunk.
     *
     * @param s1 first sequence
     * @param s2 second sequence
     * @return a negative value, zero or a positive value when s1 sorts
     *         before, equal to or after s2
     */
    @Override
    public int compare(CharSequence s1, CharSequence s2) {
        final Matcher left = PATTERN.matcher(s1);
        final Matcher right = PATTERN.matcher(s2);

        // find() can only fail once the end of the input has been consumed.
        while (left.find() && right.find()) {
            // Textual part of the chunk: plain lexicographic comparison.
            final int textOrder = left.group(1).compareTo(right.group(1));
            if (textOrder != 0) {
                return textOrder;
            }

            // Numeric part of the chunk.
            final String leftDigits = left.group(2);
            final String rightDigits = right.group(2);
            final boolean leftHasNumber = !leftDigits.isEmpty();
            final boolean rightHasNumber = !rightDigits.isEmpty();
            if (!leftHasNumber) {
                // A missing number sorts before any number.
                return rightHasNumber ? -1 : 0;
            }
            if (!rightHasNumber) {
                return +1;
            }

            // BigInteger so that arbitrarily long digit runs do not overflow.
            final int numericOrder =
                    new BigInteger(leftDigits).compareTo(new BigInteger(rightDigits));
            if (numericOrder != 0) {
                return numericOrder;
            }
        }

        // One input ran out of chunks: the exhausted one sorts first.
        if (left.hitEnd() && right.hitEnd()) {
            return 0;
        }
        return left.hitEnd() ? -1 : +1;
    }
}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment