From 51560d1d8ea5dd94c93a450dcd5d5fdb60702d7e Mon Sep 17 00:00:00 2001 From: javi Date: Tue, 16 Jun 2020 15:49:31 +0200 Subject: [PATCH] New Feature: Flexible Gazetter. --- CHANGELOG | 19 ++-- README.md | 50 ++++++---- .../nlp/gate/generic/component/main/App.java | 94 ++++++++++++++----- 3 files changed, 113 insertions(+), 50 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 69f6083..5b1b46a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,17 +1,24 @@ # Change Log -## Version 1.0, 2020-03-03 +## Version 1.3, 2020-06-16 + +Add Flexible Gazetteer Processing Resource. Enable the possibility to run this gazetteer lookup using features inside the Token annotation. + +Minor modification of parameter names, using the same parameters as in the GATE plugins definitions. + +## Version 1.2, 2020-03-25 + +Externalization of Gate parameters: gazetteerFeatureSeparator, caseSensitive and longestMatchOnly. Now these parameters can be passed to the component. +To see the defaults please go to help. -First version of the component. +Internal Map parameters in order to be clear managing the parameters. ## Version 1.1, 2020-03-10 Posibility of adding a .zip file in the dictionary definition. Parameter -l ---> Dictionary List definitions. A lists.def Gate-formatted file separated by tab can be provided or a zip file that contains the dictionary/gazetteer files including the lists.def -## Version 1.2, 2020-03-25 -Externalization of Gate parameters: gazetteerFeatureSeparator, caseSensitive and longestMatchOnly. Now this parameters can be passed to the component. -To see the defaults please go to help. +## Version 1.0, 2020-03-03 -Internal Map parameters in order to be clear managing the parameters. \ No newline at end of file +First version of the component. 
\ No newline at end of file diff --git a/README.md b/README.md index ba120ea..36ce17b 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,24 @@ # nlp-gate-generic-component -Text mining GATE generic component for run in Batch/Pipeline mode. +Text mining GATE generic component to run in Batch/Pipeline mode using software containers (Docker). ## Description -This component is a docker wrapper that execute the GATE ANNIE DefaultGazeteer and JAPE rules in batch mode. +This tool executes the Default Gazetteer or Flexible Gazetteer lookup given dictionaries passed as parameters and, in a second stage, executes JAPE rules given a main.jape file. +The list of dictionary/gazetteer entries has to be provided in the GATE format. +The ANNIE SerialAnalyserController is used to execute the pipeline. -The tool execute the Default Gazeteer Lookup given dictionaries passed as parameters and, in a second stage, execute JAPE rules given a main.jape file. +This component is a Docker wrapper that executes, in batch mode, GATE Processing Resources: -The list of the dictionaries/gazeteers entries has to be provided as in the GATE DefaultGazzeteer format. + DefaultGazetteer: https://gate.ac.uk/sale/tao/splitch13.html#x18-32200013.2 + FlexibleGazetteer: https://gate.ac.uk/sale/tao/splitch13.html#x18-33300013.6 + JAPE Transducer: https://gate.ac.uk/sale/thakker-jape-tutorial/GATE%20JAPE%20manual.pdf -More information about ANNIE DefaultGazeteer: https://gate.ac.uk/sale/tao/splitch13.html#x18-32200013.2 +To this end it uses the corresponding GATE plugins present in ANNIE and in TOOLS. -More information about JAPE rules: -https://gate.ac.uk/sale/thakker-jape-tutorial/GATE%20JAPE%20manual.pdf +This library is useful if you need to execute gazetteer lookup and JAPE rules in batch mode, for example, using Nextflow as workflow manager. -This library is very useful if you need to execute gazeteers lookup and JAPE rules in batch mode, inside a Nextflow pipeline for example. 
- -## Actual Version: 1.2, 2020-03-25 +## Actual Version: 1.3, 2020-06-16 ## [Changelog](https://gitlab.bsc.es/inb/text-mining/generic-tools/nlp-gate-generic-component/blob/master/CHANGELOG) ## Docker @@ -31,25 +32,38 @@ javicorvi/nlp-gate-generic-component mkdir ${PWD}/output_folder; docker run --rm -u $UID -v ${PWD}/input_folder:/in:ro -v ${PWD}/output_folder:/out:rw nlp-gate-generic-component nlp-gate-generic-component -i /in -o /out -a ANNOTATION_SET -l in/dictionaries/lists.def -j in/jape_rules/main.jape Parameters:

--i input folder with the documents to annotated. The documents could be plain txt or xml gate documents. +-i or input: input folder with the documents to be annotated. The documents could be plain txt or xml gate documents. +

+

+-o or -output: output folder with the annotated documents in GATE format. +

+

+-gt or -gazetter_type: Gazetteer type: default or flexible. If no value is provided, the DefaultGazetteer is used. +

+

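+-inputFeatureNames: See the Flexible Gazetteer required parameters. These feature values are used to replace the corresponding original text. Default: Token.root,Token.word. Use commas to separate several features, e.g. Token.xxx,Token.yyy. +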

+

+-a or outputASName: Output Annotation Set. Annotation set where the annotations will be included for the gazetteer lookup and for the JAPE rules. +

+

+-ia or inputASName: Input Annotation Set. If you want to provide a different input annotation set, set this parameter. By default the -a output annotation set is used as input.

--o output folder with the documents annotated in gate format. +-l or listsURL: Dictionary List definitions. A lists.def Gate-formatted file separated by tab can be provided or a zip file that contains the dictionary/gazetteer files including the lists.def

--a Output Annotation Set. Annotation set where the annotation will be included for the gazetter lookup and for the Jape Rules. +-gazetteerFeatureSeparator: The character used to add arbitrary features to gazetteer entries. Default tab.

--ia Input Annotation Set. If you want to provided different input annotation, set this parameter. By default the -a output annotation set is used as input. +-caseSensitive: Should the gazetteer be case sensitive during matching. Default false

--l Dictionary List definitions. A lists.def Gate-formatted file separated by tab can be provided or a zip file that contains the dictionary/gazetteer files including the lists.def +-longestMatchOnly: This parameter is only relevant when the list of lookups contains proper prefixes. The default behaviour (when this parameter is set to true) is to only match the longest entry. Setting this parameter to false will cause the gazetteer to match all possible prefixes.

--j main.jape path with the JAPE rules to be executed. +-j or jape_main: main.jape path with the JAPE rules to be executed.

- -In this example the dictionaries/gazeteers and the jape rules are in the input folder. ## Built With diff --git a/src/main/java/es/bsc/inb/nlp/gate/generic/component/main/App.java b/src/main/java/es/bsc/inb/nlp/gate/generic/component/main/App.java index 2429cca..828e657 100644 --- a/src/main/java/es/bsc/inb/nlp/gate/generic/component/main/App.java +++ b/src/main/java/es/bsc/inb/nlp/gate/generic/component/main/App.java @@ -6,6 +6,8 @@ import java.io.FileOutputStream; import java.io.IOException; import java.net.URL; import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.zip.ZipEntry; @@ -32,7 +34,7 @@ import gate.util.ExtensionFileFilter; import gate.util.GateException; /** - * Generic Library for execute GATE Dictionary/Gazetteer and JAPE rules processing in batch mode. + * Generic Library for execute GATE DefaultGazetteer and FlexibleGazetter and JAPE rules processing in batch mode. * */ public class App { @@ -50,20 +52,29 @@ public class App { output.setRequired(true); options.addOption(output); - Option listDefinitions = new Option("l", "lists_definitions", true, "Dictionary List definitions. " + Option listDefinitions = new Option("l", "listsURL", true, "Dictionary List definitions. " + "A lists.def Gate-formatted file separated by tab can be provided or a zip file that contains the dictionary/gazetteer files including the lists.def "); listDefinitions.setRequired(false); options.addOption(listDefinitions); + Option gazetterType = new Option("gt", "gazetter_type", true, "Gazetter type: default, flexible. If no value is provided the DefautlGazetter is used"); + gazetterType.setRequired(false); + options.addOption(gazetterType); + + Option inputFeatureNames = new Option("inputFeatureNames", "inputFeatureNames", true, "See flexible gazetter required fields. These feature values are used to replace the corresponding original text. 
" + + " Default vales are Token.root,Token.word. Format if there is more than one feature: Token.xxx,Token.yyy"); + inputFeatureNames.setRequired(false); + options.addOption(inputFeatureNames); + Option japeMain = new Option("j", "jape_main", true, "Jape Main file for processing rules"); japeMain.setRequired(false); options.addOption(japeMain); - Option set = new Option("a", "annotation_set", true, "Output Annotation Set. Annotation set where the annotation will be included for the gazetter lookup and for the Jape Rules"); + Option set = new Option("a", "outputASName", true, "Output Annotation Set. Annotation set where the annotation will be included for the gazetter lookup and for the Jape Rules"); set.setRequired(true); options.addOption(set); - Option iset = new Option("ia", "input_annotation_set", true, "Input Annotation Set. If you want to provided different input annotation set this parameter. By default the -a output annotation set is used as input."); + Option iset = new Option("ia", "inputASName", true, "Input Annotation Set. If you want to provided different input annotation set this parameter. 
By default the -a output annotation set is used as input."); iset.setRequired(false); options.addOption(iset); @@ -101,7 +112,7 @@ public class App { String outputFilePath = cmd.getOptionValue("output"); String workdirPath = cmd.getOptionValue("workdir"); - String listsDefinitionsPath = cmd.getOptionValue("lists_definitions"); + String listsDefinitionsPath = cmd.getOptionValue("listsURL"); String japeMainPath = cmd.getOptionValue("jape_main"); if (!java.nio.file.Files.isDirectory(Paths.get(cmd.getOptionValue("input")))) { @@ -110,18 +121,29 @@ public class App { } parameters.put("inputFilePath", cmd.getOptionValue("input")); - if (cmd.getOptionValue("annotation_set")==null) { + if (cmd.getOptionValue("gazetter_type")==null) { + System.out.println("The DefaultGazetter will be used"); + parameters.put("gazetter_type", "default"); + }else if(cmd.getOptionValue("gazetter_type").equals("default") || cmd.getOptionValue("gazetter_type").equals("flexible")){ + parameters.put("gazetter_type", cmd.getOptionValue("gazetter_type")); + System.out.println("The Gazetter to be used: " + cmd.getOptionValue("gazetter_type")); + }else { + System.out.println("Wrong Gazetter Type configuration (default or flexible): " + cmd.getOptionValue("gazetter_type")); + } + + parameters.put("inputFeatureNames", cmd.getOptionValue("inputFeatureNames")); + + if (cmd.getOptionValue("outputASName")==null) { System.out.println("Please set the annotation set where the annotation will be included"); System.exit(1); } - parameters.put("annotationSet", cmd.getOptionValue("annotation_set")); - + parameters.put("outputASName", cmd.getOptionValue("outputASName")); - if (cmd.getOptionValue("input_annotation_set")==null) { + if (cmd.getOptionValue("inputASName")==null) { System.out.println("The input annotation set not set, same as output is selected"); - parameters.put("inputAnnotationSet", cmd.getOptionValue("annotation_set")); + parameters.put("inputASName", cmd.getOptionValue("outputASName")); }else { - 
parameters.put("inputAnnotationSet", cmd.getOptionValue("input_annotation_set")); + parameters.put("inputASName", cmd.getOptionValue("inputASName")); } if (cmd.getOptionValue("gazetteerFeatureSeparator")==null) { @@ -180,7 +202,7 @@ public class App { } - parameters.put("listsDefinitionsPath", listsDefinitionsPath); + parameters.put("listsURL", listsDefinitionsPath); if(japeMainPath==null) { System.out.println("No Jape Main Rules were provided."); @@ -235,7 +257,6 @@ public class App { * @throws IOException */ private static void process(Map parameters) throws GateException, IOException { - // private static void process(String inputDirectory, String outputDirectory, String listsDefinitionsPath, String japeRules, String inputAnnotationSet, String outAnnotationSet, String workdirPath) throws GateException, IOException { try { System.out.println("App :: main :: INIT PROCESS"); Corpus corpus = Factory.newCorpus("My Files"); @@ -245,31 +266,52 @@ public class App { corpus.populate(url, filter, null, false); Plugin anniePlugin = new Plugin.Maven("uk.ac.gate.plugins", "annie", "8.6"); Gate.getCreoleRegister().registerPlugin(anniePlugin); + // create a serial analyser controller to run ANNIE with SerialAnalyserController annieController = (SerialAnalyserController) Factory.createResource("gate.creole.SerialAnalyserController", Factory.newFeatureMap(), Factory.newFeatureMap(), "ANNIE"); annieController.setCorpus(corpus); - ProcessingResource pr_gazetter = null; - if(parameters.get("listsDefinitionsPath")!=null) { - //Gazetter parameters + if(parameters.get("listsURL")!=null) { + //Basic Gazetter + //Gazetter parameters FeatureMap params = Factory.newFeatureMap(); - params.put("listsURL", new File(parameters.get("listsDefinitionsPath")).toURL()); + params.put("listsURL", new File(parameters.get("listsURL")).toURL()); params.put("gazetteerFeatureSeparator", parameters.get("gazetteerFeatureSeparator")); params.put("caseSensitive",parameters.get("caseSensitive")); 
params.put("longestMatchOnly",parameters.get("longestMatchOnly")); - pr_gazetter = (ProcessingResource) Factory.createResource("gate.creole.gazetteer.DefaultGazetteer", params); - pr_gazetter.setParameterValue("annotationSetName", parameters.get("annotationSet")); - annieController.add(pr_gazetter); + ProcessingResource pr_basic_gazetter = (ProcessingResource) Factory.createResource("gate.creole.gazetteer.DefaultGazetteer", params); + + //Flexible Gazetter + if(parameters.get("gazetter_type").equals("flexible")) { + Plugin toolsPlugin = new Plugin.Maven("uk.ac.gate.plugins", "tools", "8.6"); + Gate.getCreoleRegister().registerPlugin(toolsPlugin); + FeatureMap params2 = Factory.newFeatureMap(); + ProcessingResource pr_flexi_gazetter = (ProcessingResource) Factory.createResource("gate.creole.gazetteer.FlexibleGazetteer", params2); + pr_flexi_gazetter.setParameterValue("inputASName", parameters.get("inputASName")); + if (parameters.get("inputFeatureNames")==null) { + System.out.println("No inputFeatureNames defined, Token.root and Token.word as default"); + ArrayList l = new ArrayList(Arrays.asList("Token.root","Token.word")); + pr_flexi_gazetter.setParameterValue("inputFeatureNames", l); + }else { + ArrayList l = new ArrayList(Arrays.asList(parameters.get("inputFeatureNames").toString().split(","))); + pr_flexi_gazetter.setParameterValue("inputFeatureNames", l); + } + pr_flexi_gazetter.setParameterValue("gazetteerInst", pr_basic_gazetter); + pr_flexi_gazetter.setParameterValue("outputASName", parameters.get("outputASName")); + annieController.add(pr_flexi_gazetter); + }else { //Default Gazetter + pr_basic_gazetter.setParameterValue("annotationSetName", parameters.get("outputASName")); + annieController.add(pr_basic_gazetter); + } } LanguageAnalyser jape = null; if(parameters.get("japeMainPath")!=null) { jape = (LanguageAnalyser)gate.Factory.createResource("gate.creole.Transducer", gate.Utils.featureMap( "grammarURL", new 
File(parameters.get("japeMainPath")).toURI().toURL(),"encoding", "UTF-8")); - jape.setParameterValue("inputASName", parameters.get("inputAnnotationSet")); - jape.setParameterValue("outputASName", parameters.get("annotationSet")); - + jape.setParameterValue("inputASName", parameters.get("inputASName")); + jape.setParameterValue("outputASName", parameters.get("outputASName")); annieController.add(jape); } @@ -277,9 +319,9 @@ public class App { annieController.execute(); //free resources - if(pr_gazetter!=null) { - Factory.deleteResource(pr_gazetter); - } +// if(pr_gazetter!=null) { +// Factory.deleteResource(pr_gazetter); +// } if(jape!=null) { Factory.deleteResource(jape); } -- GitLab