Commit 2f6684da authored by Javi Corvi's avatar Javi Corvi
Browse files

Initial commit

parents
#
# Project specific excludes
#
tomcat
#
# Default excludes
#
# Binaries
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
*.war
*.ear
*.sar
*.class
# Maven
target/
# IntelliJ project files
*.iml
*.iws
*.ipr
.idea/
# Eclipse project files
.settings/
.classpath
.project
# NetBeans specific
nbproject/private/
build/
nbbuild/
dist/
nbdist/
nbactions.xml
nb-configuration.xml
# OS
.DS_Store
# Misc
*.swp
release.properties
pom.xml.releaseBackup
pom.xml.tag
#custom
pos/
# Image that builds and hosts the dictionary/JAPE annotation tool on Alpine.
FROM alpine:3.9

# Version suffix given to the packaged jar; forwarded to docker-build.sh below.
ARG DICT_TAGGER_VERSION=1.0

# All sources and resources live (and are built) in this directory.
WORKDIR /usr/local/share/dictionary_annotation

# Build script goes on the PATH so it can be invoked by name.
COPY docker-build.sh /usr/local/bin/docker-build.sh

# Maven project and the resources shipped with the image.
COPY pom.xml .
COPY src src
COPY jape_rules jape_rules
COPY dictionaries dictionaries

# Create the log directory and open up permissions on the working tree.
RUN mkdir logs \
 && chmod u=rwx,g=rwx,o=r /usr/local/share/dictionary_annotation -R \
 && chmod u=rwx,g=rwx,o=rwx logs -R

# Install the toolchain, build the jar and generate the launcher script.
RUN docker-build.sh ${DICT_TAGGER_VERSION}
# nlp-gate-generic-component
A generic GATE text-mining component designed to run in batch/pipeline mode.
## Description
This component is a Docker wrapper that executes the GATE ANNIE DefaultGazetteer and JAPE rules in batch mode.
The tool first runs the DefaultGazetteer lookup using the dictionaries passed as parameters and then, in a second stage, executes the JAPE rules starting from a given main.jape file.
The list of dictionary/gazetteer entries has to be provided in the GATE DefaultGazetteer format.
More information about the ANNIE DefaultGazetteer: https://gate.ac.uk/sale/tao/splitch13.html#x18-32200013.2
More information about JAPE rules:
https://gate.ac.uk/sale/thakker-jape-tutorial/GATE%20JAPE%20manual.pdf
This library is very useful if you need to execute gazetteer lookups and JAPE rules in batch mode, for example inside a Nextflow pipeline.
## Cloning this component
git clone --depth 1 https://github.com/inab/docker-textmining-tools.git nlp-gate-generic-component
cd nlp-gate-generic-component
git filter-branch --prune-empty --subdirectory-filter nlp-gate-generic-component HEAD
## Build and Run the Docker
# To build the Docker image, go into the nlp-gate-generic-component folder and execute
docker build -t nlp-gate-generic-component .
# To run the container, set the input folder and the output folder
mkdir ${PWD}/output_folder; docker run --rm -u $UID -v ${PWD}/input_folder:/in:ro -v ${PWD}/output_folder:/out:rw nlp-gate-generic-component nlp-gate-generic-component -i /in -o /out -a ANNOTATION_SET -l in/dictionaries/lists.def -j in/jape_rules/main.jape
Parameters:
<p>
-i input folder with the documents to annotate. The documents can be plain text (.txt) or GATE XML documents.
</p>
<p>
-o output folder with the documents annotated in gate format.
</p>
<p>
-a Output Annotation Set. The annotation set in which the annotations produced by the gazetteer lookup and by the JAPE rules will be stored.
</p>
<p>
-ia Input Annotation Set. Set this parameter if you want to provide a different input annotation set. By default the -a output annotation set is also used as input.
</p>
<p>
-l list definition of the dictionary in GATE format.
</p>
<p>
-j main.jape path with the JAPE rules to be executed.
</p>
In this example the dictionaries/gazetteers and the JAPE rules are located in the input folder.
## Built With
* [Docker](https://www.docker.com/) - Docker Containers
* [Maven](https://maven.apache.org/) - Dependency Management
* [GATE](https://gate.ac.uk/overview.html) - GATE: a full-lifecycle open source solution for text processing
## Versioning
We use [SemVer](http://semver.org/) for versioning. For the versions available, see the [tags on this repository](https://github.com/inab/docker-textmining-tools/edit/master/nlp-standard-preprocessing/tags).
## Authors
* **Javier Corvi**
## License
This project is licensed under the GNU GENERAL PUBLIC LICENSE Version 3 - see the [LICENSE.md](LICENSE.md) file for details
#!/bin/sh
#
# Builds the annotation tool with Maven and installs a launcher script.
#
# Usage: docker-build.sh [VERSION]
#   VERSION  suffix of the installed jar (default: 1.0)
#
# The script must be run from the directory that contains pom.xml; the
# launcher expects the resulting jar to live in DICT_TAGGER_HOME, so in
# practice the current directory has to be DICT_TAGGER_HOME.

BASEDIR=/usr/local
DICT_TAGGER_HOME="${BASEDIR}/share/dictionary_annotation/"
DICT_TAGGER_VERSION=1.0

# Exit on error
set -e

if [ $# -ge 1 ] ; then
	DICT_TAGGER_VERSION="$1"
fi

if [ -f /etc/alpine-release ] ; then
	# Installing OpenJDK 8
	apk add --update openjdk8-jre

	# dict tagger development dependencies
	apk add openjdk8 git maven
else
	# Runtime dependencies
	apt-get update
	# -y: the build is non-interactive, nobody can answer the confirmation prompt
	apt-get install -y openjdk-8-jre

	# The development dependencies
	apt-get install -y openjdk-8-jdk git maven
fi

mvn clean install -DskipTests

# Rename the jar. The assembly is matched by its classifier instead of a
# hard-coded artifact name, so the step keeps working whatever artifactId
# and version are declared in pom.xml.
mv target/*-jar-with-dependencies.jar "nlp-generic-dictionary-annotation-${DICT_TAGGER_VERSION}.jar"

# Launcher: DICT_TAGGER_* are expanded now, \$JAVA_OPTS and \$@ at run time.
cat > /usr/local/bin/nlp-generic-dictionary-annotation <<EOF
#!/bin/sh
exec java \$JAVA_OPTS -jar "${DICT_TAGGER_HOME}/nlp-generic-dictionary-annotation-${DICT_TAGGER_VERSION}.jar" -workdir "${DICT_TAGGER_HOME}" "\$@"
EOF
chmod +x /usr/local/bin/nlp-generic-dictionary-annotation

# Alias matching the command name used in the README.
ln -sf nlp-generic-dictionary-annotation /usr/local/bin/nlp-gate-generic-component

# delete build inputs and intermediate output
rm -R target src pom.xml

if [ -f /etc/alpine-release ] ; then
	# add bash for nextflow
	apk add bash

	# Removing not needed tools
	apk del openjdk8 git maven
	rm -rf /var/cache/apk/*
else
	# add bash for nextflow
	apt-get install -y bash

	# Removing not needed tools
	apt-get remove -y openjdk-8-jdk git maven
	# Drop downloaded packages and package indexes
	apt-get clean
	rm -rf /var/lib/apt/lists/*
fi
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>es.bsc.inb.nlp</groupId>
	<artifactId>nlp-gate-generic-component</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>nlp-gate-generic-component</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<maven.compiler.source>1.8</maven.compiler.source>
		<maven.compiler.target>1.8</maven.compiler.target>
	</properties>

	<!-- Libraries used by the application code. The assembly plugin is a build
	     tool, not a library, so it is declared only under <build> below;
	     listing it here would bundle it into the jar-with-dependencies. -->
	<dependencies>
		<dependency>
			<groupId>uk.ac.gate</groupId>
			<artifactId>gate-core</artifactId>
			<version>8.5.1</version>
		</dependency>
		<dependency>
			<groupId>commons-cli</groupId>
			<artifactId>commons-cli</artifactId>
			<version>1.4</version>
		</dependency>
	</dependencies>

	<build>
		<plugins>
			<!-- Builds <artifactId>-<version>-jar-with-dependencies.jar during the
			     package phase, with App as the executable main class. -->
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-assembly-plugin</artifactId>
				<!-- Pinned so the build does not depend on Maven's default plugin version. -->
				<version>3.1.0</version>
				<executions>
					<execution>
						<phase>package</phase>
						<goals>
							<goal>single</goal>
						</goals>
						<configuration>
							<archive>
								<manifest>
									<mainClass>es.bsc.inb.nlp.gate.generic.component.main.App</mainClass>
								</manifest>
							</archive>
							<descriptorRefs>
								<descriptorRef>jar-with-dependencies</descriptorRef>
							</descriptorRefs>
						</configuration>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>
</project>
package es.bsc.inb.nlp.gate.generic.component.main;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Paths;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.LanguageAnalyser;
import gate.ProcessingResource;
import gate.creole.Plugin;
import gate.creole.SerialAnalyserController;
import gate.util.ExtensionFileFilter;
import gate.util.GateException;
/**
* Generic Library for execute GATE Dictionary/Gazetteer and JAPE rules processing in batch mode.
*
*/
public class App {
public static void main( String[] args ){
Options options = new Options();
Option input = new Option("i", "input", true, "input directory path");
input.setRequired(true);
options.addOption(input);
Option output = new Option("o", "output", true, "output directory path");
output.setRequired(true);
options.addOption(output);
Option listDefinitions = new Option("l", "lists_definitions", true, "Dictionary List definitions, Gate format.");
listDefinitions.setRequired(false);
options.addOption(listDefinitions);
Option japeMain = new Option("j", "jape_main", true, "Jape Main file for processing rules");
japeMain.setRequired(false);
options.addOption(japeMain);
Option set = new Option("a", "annotation_set", true, "Output Annotation Set. Annotation set where the annotation will be included for the gazetter lookup and for the Jape Rules");
set.setRequired(true);
options.addOption(set);
Option iset = new Option("ia", "input_annotation_set", true, "Input Annotation Set. If you want to provided different input annotation set this parameter. By default the -a output annotation set is used as input.");
iset.setRequired(false);
options.addOption(iset);
Option workdir = new Option("w", "workdir", true, "workDir directory path");
workdir.setRequired(false);
options.addOption(workdir);
CommandLineParser parser = new DefaultParser();
HelpFormatter formatter = new HelpFormatter();
CommandLine cmd = null;
try {
cmd = parser.parse(options, args);
} catch (ParseException e) {
System.out.println(e.getMessage());
formatter.printHelp("utility-name", options);
System.exit(1);
}
String inputFilePath = cmd.getOptionValue("input");
String outputFilePath = cmd.getOptionValue("output");
String workdirPath = cmd.getOptionValue("workdir");
String annotationSet = cmd.getOptionValue("annotation_set");
String inputAnnotationSet = cmd.getOptionValue("input_annotation_set");
String listsDefinitionsPath = cmd.getOptionValue("lists_definitions");
String japeMainPath = cmd.getOptionValue("jape_main");
if (!java.nio.file.Files.isDirectory(Paths.get(inputFilePath))) {
System.out.println(" Please set the inputDirectoryPath ");
System.exit(1);
}
if (annotationSet==null) {
System.out.println("Please set the annotation set where the annotation will be included");
System.exit(1);
}
if (inputAnnotationSet==null) {
System.out.println("The input annotation set not set, same as output is selected");
inputAnnotationSet = annotationSet;
}
if(workdirPath==null) {
workdirPath = "";
}
Boolean execution = false;
if(listsDefinitionsPath==null) {
System.out.println("No dictionary was provided.");
}else {
listsDefinitionsPath = workdirPath+listsDefinitionsPath;
execution = true;
if (!java.nio.file.Files.isRegularFile(Paths.get(listsDefinitionsPath))) {
System.out.println("Please set a correct path to the list of dictionaries to annotate");
System.exit(1);
}
}
if(japeMainPath==null) {
System.out.println("No Jape Main Rules were provided.");
}else {
japeMainPath = workdirPath+japeMainPath;
execution = true;
if (!java.nio.file.Files.isRegularFile(Paths.get(japeMainPath))) {
System.out.println("Please set a correct path to the main jape rules");
System.exit(1);
}
}
if(!execution) {
System.out.println("No gazzeter or Jape Rules were provided. There is nothing to do. Please review your configuration");
System.exit(1);
}
File outputDirectory = new File(outputFilePath);
if(!outputDirectory.exists())
outputDirectory.mkdirs();
try {
Gate.init();
} catch (GateException e) {
System.out.println("App :: main :: Gate Exception ");
e.printStackTrace();
System.exit(1);
}
try {
process(inputFilePath, outputFilePath, listsDefinitionsPath, japeMainPath, inputAnnotationSet, annotationSet, workdirPath);
} catch (GateException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
* Annotation Process
* @param inputDirectory
* @param outputDirectory
* @throws GateException
* @throws IOException
*/
private static void process(String inputDirectory, String outputDirectory, String listsDefinitionsPath, String japeRules, String inputAnnotationSet, String outAnnotationSet, String workdirPath) throws GateException, IOException {
try {
System.out.println("App :: main :: INIT PROCESS");
Corpus corpus = Factory.newCorpus("My Files");
File directory = new File(inputDirectory);
ExtensionFileFilter filter = new ExtensionFileFilter("Txt files", new String[]{"txt","xml"});
URL url = directory.toURL();
corpus.populate(url, filter, null, false);
Plugin anniePlugin = new Plugin.Maven("uk.ac.gate.plugins", "annie", "8.5");
Gate.getCreoleRegister().registerPlugin(anniePlugin);
// create a serial analyser controller to run ANNIE with
SerialAnalyserController annieController = (SerialAnalyserController) Factory.createResource("gate.creole.SerialAnalyserController",
Factory.newFeatureMap(), Factory.newFeatureMap(), "ANNIE");
annieController.setCorpus(corpus);
ProcessingResource pr_gazetter = null;
if(listsDefinitionsPath!=null) {
//Gazetter parameters
FeatureMap params = Factory.newFeatureMap();
params.put("listsURL", new File(listsDefinitionsPath).toURL());
params.put("gazetteerFeatureSeparator", "\t");
params.put("caseSensitive",false);
pr_gazetter = (ProcessingResource) Factory.createResource("gate.creole.gazetteer.DefaultGazetteer", params);
pr_gazetter.setParameterValue("annotationSetName", outAnnotationSet);
annieController.add(pr_gazetter);
}
LanguageAnalyser jape = null;
if(japeRules!=null) {
jape = (LanguageAnalyser)gate.Factory.createResource("gate.creole.Transducer", gate.Utils.featureMap(
"grammarURL", new File(japeRules).toURI().toURL(),"encoding", "UTF-8"));
jape.setParameterValue("inputASName", inputAnnotationSet);
jape.setParameterValue("outputASName", outAnnotationSet);
annieController.add(jape);
}
// execute controller
annieController.execute();
//free resources
if(pr_gazetter!=null) {
Factory.deleteResource(pr_gazetter);
}
if(jape!=null) {
Factory.deleteResource(jape);
}
Factory.deleteResource(annieController);
Gate.removeKnownPlugin(anniePlugin);
//Save documents in different output
for (Document document : corpus) {
String nameOutput = "";
if(document.getName().indexOf(".txt")!=-1) {
nameOutput = document.getName().substring(0, document.getName().indexOf(".txt")+4).replace(".txt", ".xml");
}else {
nameOutput = document.getName().substring(0, document.getName().indexOf(".xml")+4);
}
java.io.Writer out = new java.io.BufferedWriter(new java.io.OutputStreamWriter(new FileOutputStream(new File(outputDirectory + File.separator + nameOutput), false)));
out.write(document.toXml());
out.close();
}
}catch(Exception e) {
System.out.println("App :: main :: ERROR ");
e.printStackTrace();
System.exit(1);
}
System.out.println("App :: main :: END PROCESS");
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment