Commit f2a5e779 authored by javi's avatar javi
Browse files

Extract abbreviations option and Two lists for gazetter lookup option

parent 40fef088
Pipeline #21160 passed with stage
in 2 minutes and 2 seconds
......@@ -185,8 +185,9 @@ public class App {
System.out.println("Dictionary Path " + listsDefinitionsPath);
execution = true;
if (!java.nio.file.Files.isRegularFile(Paths.get(listsDefinitionsPath))) {
System.out.println("Please set the list of dictionaries to annotate. You can provide the list.def file or a zip file. Please if you provided a zip file remember that it must contain a list.def file inside");
System.exit(1);
//System.out.println("Please set the list of dictionaries to annotate. You can provide the list.def file or a zip file. Please if you provided a zip file remember that it must contain a list.def file inside");
//System.exit(1);
//remember to validate with comma separated
}
if(listsDefinitionsPath.endsWith(".zip")) {
......
......@@ -57,38 +57,42 @@ public class Process implements Runnable {
annieController = (SerialAnalyserController) Factory.createResource("gate.creole.SerialAnalyserController",
Factory.newFeatureMap(), Factory.newFeatureMap(), "ANNIE");
if(parameters.get("listsURL")!=null) {
//Basic Gazetter
//Gazetter parameters
FeatureMap params = Factory.newFeatureMap();
params.put("listsURL", new File(parameters.get("listsURL")).toURL());
params.put("gazetteerFeatureSeparator", parameters.get("gazetteerFeatureSeparator"));
params.put("caseSensitive",parameters.get("caseSensitive"));
params.put("longestMatchOnly",parameters.get("longestMatchOnly"));
ProcessingResource pr_basic_gazetter = (ProcessingResource) Factory.createResource("gate.creole.gazetteer.DefaultGazetteer", params);
//Flexible Gazetter
if(parameters.get("gazetter_type").equals("flexible")) {
Plugin toolsPlugin = new Plugin.Maven("uk.ac.gate.plugins", "tools", "8.6");
Gate.getCreoleRegister().registerPlugin(toolsPlugin);
FeatureMap params2 = Factory.newFeatureMap();
ProcessingResource pr_flexi_gazetter = (ProcessingResource) Factory.createResource("gate.creole.gazetteer.FlexibleGazetteer", params2);
pr_flexi_gazetter.setParameterValue("inputASName", parameters.get("inputASName"));
if (parameters.get("inputFeatureNames")==null) {
System.out.println("No inputFeatureNames defined, Token.root and Token.word as default");
ArrayList<String> l = new ArrayList<String>(Arrays.asList("Token.root","Token.word"));
pr_flexi_gazetter.setParameterValue("inputFeatureNames", l);
}else {
ArrayList<String> l = new ArrayList<String>(Arrays.asList(parameters.get("inputFeatureNames").toString().split(",")));
pr_flexi_gazetter.setParameterValue("inputFeatureNames", l);
String listURL = parameters.get("listsURL");
String[] main_files = listURL.split(",");
for (String main_file : main_files) {
//Basic Gazetter
//Gazetter parameters
FeatureMap params = Factory.newFeatureMap();
params.put("listsURL", new File(main_file).toURL());
params.put("gazetteerFeatureSeparator", parameters.get("gazetteerFeatureSeparator"));
params.put("caseSensitive",parameters.get("caseSensitive"));
params.put("longestMatchOnly",parameters.get("longestMatchOnly"));
ProcessingResource pr_basic_gazetter = (ProcessingResource) Factory.createResource("gate.creole.gazetteer.DefaultGazetteer", params);
//Flexible Gazetter
if(parameters.get("gazetter_type").equals("flexible")) {
Plugin toolsPlugin = new Plugin.Maven("uk.ac.gate.plugins", "tools", "8.6");
Gate.getCreoleRegister().registerPlugin(toolsPlugin);
FeatureMap params2 = Factory.newFeatureMap();
ProcessingResource pr_flexi_gazetter = (ProcessingResource) Factory.createResource("gate.creole.gazetteer.FlexibleGazetteer", params2);
pr_flexi_gazetter.setParameterValue("inputASName", parameters.get("inputASName"));
if (parameters.get("inputFeatureNames")==null) {
System.out.println("No inputFeatureNames defined, Token.root and Token.word as default");
ArrayList<String> l = new ArrayList<String>(Arrays.asList("Token.root","Token.word"));
pr_flexi_gazetter.setParameterValue("inputFeatureNames", l);
}else {
ArrayList<String> l = new ArrayList<String>(Arrays.asList(parameters.get("inputFeatureNames").toString().split(",")));
pr_flexi_gazetter.setParameterValue("inputFeatureNames", l);
}
pr_flexi_gazetter.setParameterValue("gazetteerInst", pr_basic_gazetter);
pr_flexi_gazetter.setParameterValue("outputASName", parameters.get("outputASName"));
annieController.add(pr_flexi_gazetter);
}else { //Default Gazetter
pr_basic_gazetter.setParameterValue("annotationSetName", parameters.get("outputASName"));
annieController.add(pr_basic_gazetter);
}
pr_flexi_gazetter.setParameterValue("gazetteerInst", pr_basic_gazetter);
pr_flexi_gazetter.setParameterValue("outputASName", parameters.get("outputASName"));
annieController.add(pr_flexi_gazetter);
}else { //Default Gazetter
pr_basic_gazetter.setParameterValue("annotationSetName", parameters.get("outputASName"));
annieController.add(pr_basic_gazetter);
}
}
}
}
LanguageAnalyser jape = null;
if(parameters.get("japeMainPath")!=null) {
......
package es.bsc.inb.nlp.gate.generic.component.util;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Vector;
/**
*
* An alternative execution method for the ExtractAbbrev from A.S. Schwartz, M.A. Hearst.
*
* The ExtractAbbrev class implements a simple algorithm for extraction of abbreviations and their
* definitions from biomedical text. extraction of abbreviations and their definitions from biomedical text.
* Abbreviations (short forms) are extracted from the input file, and those abbreviations
* for which a definition (long form) is found are printed out, along with that definition,
* one per line.
*
* The original definition of short abbreviations described in the following paper take into account only abbreviations inside parenthesis: i.e. ('ABBRE');
* this is not always the case found inside articles. Here we identify candidate abbreviations with the pattern "(?:[a-z]*[A-Z][a-z]*){2,}"
* no mater if they are not inside parenthesis.
*
*
* @see <a href="http://biotext.berkeley.edu/papers/psb03.pdf">A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text</a>
* A.S. Schwartz, M.A. Hearst; Pacific Symposium on Biocomputing 8:451-462(2003)
* for a detailed description of the algorithm.
*
* @author Javier Corvi
* @version 08/03/2021
*/
public class ExtractAbbrev {
HashMap mTestDefinitions = new HashMap();
HashMap mStats = new HashMap();
int truePositives = 0, falsePositives = 0, falseNegatives = 0, trueNegatives = 0;
char delimiter = '\t';
boolean testMode = false;
private boolean isValidShortForm(String str) {
return (hasLetter(str) && (Character.isLetterOrDigit(str.charAt(0)) || (str.charAt(0) == '(')));
}
private boolean hasLetter(String str) {
for (int i=0; i < str.length() ; i++)
if (Character.isLetter(str.charAt(i)))
return true;
return false;
}
private boolean hasCapital(String str) {
for (int i=0; i < str.length() ; i++)
if (Character.isUpperCase(str.charAt(i)))
return true;
return false;
}
private void loadTrueDefinitions(String inFile) {
String abbrString, defnString, str = "";
Vector entry;
HashMap definitions = mTestDefinitions;
try {
BufferedReader fin = new BufferedReader(new FileReader (inFile));
while ((str = fin.readLine()) != null) {
int j = str.indexOf(delimiter);
abbrString = str.substring(0,j).trim();
defnString = str.substring(j,str.length()).trim();
entry = (Vector)definitions.get(abbrString);
if (entry == null)
entry = new Vector();
entry.add(defnString);
definitions.put(abbrString, entry);
}
} catch (Exception e) {
e.printStackTrace();
System.out.println(str);
}
}
private boolean isTrueDefinition(String shortForm, String longForm) {
Vector entry;
Iterator itr;
entry = (Vector)mTestDefinitions.get(shortForm);
if (entry == null)
return false;
itr = entry.iterator();
while(itr.hasNext()){
if (itr.next().toString().equalsIgnoreCase(longForm))
return true;
}
return false;
}
private Vector extractAbbrPairs(String inFile) {
String str, tmpStr, longForm = "", shortForm = "";
String currSentence = "";
int openParenIndex, closeParenIndex = -1, sentenceEnd, newCloseParenIndex, tmpIndex = -1;
boolean newParagraph = true;
StringTokenizer shortTokenizer;
Vector candidates = new Vector();
try {
BufferedReader fin = new BufferedReader(new FileReader (inFile));
while ((str = fin.readLine()) != null) {
if (str.length() == 0 || newParagraph &&
! Character.isUpperCase(str.charAt(0))){
currSentence = "";
newParagraph = true;
continue;
}
newParagraph = false;
str += " ";
currSentence += str;
openParenIndex = currSentence.indexOf(" (");
do {
if (openParenIndex > -1)
openParenIndex++;
sentenceEnd = Math.max(currSentence.lastIndexOf(". "), currSentence.lastIndexOf(", "));
if ((openParenIndex == -1) && (sentenceEnd == -1)) {
//Do nothing
}
else if (openParenIndex == -1) {
currSentence = currSentence.substring(sentenceEnd + 2);
} else if ((closeParenIndex = currSentence.indexOf(')',openParenIndex)) > -1){
sentenceEnd = Math.max(currSentence.lastIndexOf(". ", openParenIndex),
currSentence.lastIndexOf(", ", openParenIndex));
if (sentenceEnd == -1)
sentenceEnd = -2;
longForm = currSentence.substring(sentenceEnd + 2, openParenIndex);
shortForm = currSentence.substring(openParenIndex + 1, closeParenIndex);
}
if (shortForm.length() > 0 || longForm.length() > 0) {
if (shortForm.length() > 1 && longForm.length() > 1) {
if ((shortForm.indexOf('(') > -1) &&
((newCloseParenIndex = currSentence.indexOf(')', closeParenIndex + 1)) > -1)){
shortForm = currSentence.substring(openParenIndex + 1, newCloseParenIndex);
closeParenIndex = newCloseParenIndex;
}
if ((tmpIndex = shortForm.indexOf(", ")) > -1)
shortForm = shortForm.substring(0, tmpIndex);
if ((tmpIndex = shortForm.indexOf("; ")) > -1)
shortForm = shortForm.substring(0, tmpIndex);
shortTokenizer = new StringTokenizer(shortForm);
if (shortTokenizer.countTokens() > 2 || shortForm.length() > longForm.length()) {
// Long form in ( )
tmpIndex = currSentence.lastIndexOf(" ", openParenIndex - 2);
tmpStr = currSentence.substring(tmpIndex + 1, openParenIndex - 1);
longForm = shortForm;
shortForm = tmpStr;
if (! hasCapital(shortForm))
shortForm = "";
}
if (isValidShortForm(shortForm)){
extractAbbrPair(shortForm.trim(), longForm.trim());
}
}
currSentence = currSentence.substring(closeParenIndex + 1);
} else if (openParenIndex > -1) {
if ((currSentence.length() - openParenIndex) > 200)
// Matching close paren was not found
currSentence = currSentence.substring(openParenIndex + 1);
break; // Read next line
}
shortForm = "";
longForm = "";
} while ((openParenIndex = currSentence.indexOf(" (")) > -1);
}
fin.close();
} catch (Exception ioe) {
ioe.printStackTrace();
System.out.println(currSentence);
System.out.println(tmpIndex);
}
return candidates;
}
private String findBestLongForm(String shortForm, String longForm) {
int sIndex;
int lIndex;
char currChar;
sIndex = shortForm.length() - 1;
lIndex = longForm.length() - 1;
for ( ; sIndex >= 0; sIndex--) {
currChar = Character.toLowerCase(shortForm.charAt(sIndex));
if (!Character.isLetterOrDigit(currChar))
continue;
while (((lIndex >= 0) && (Character.toLowerCase(longForm.charAt(lIndex)) != currChar)) ||
((sIndex == 0) && (lIndex > 0) && (Character.isLetterOrDigit(longForm.charAt(lIndex - 1)))))
lIndex--;
if (lIndex < 0)
return null;
lIndex--;
}
lIndex = longForm.lastIndexOf(" ", lIndex) + 1;
return longForm.substring(lIndex);
}
public String extractAbbrPair(String shortForm, String longForm) {
String bestLongForm;
StringTokenizer tokenizer;
int longFormSize, shortFormSize;
if (shortForm.length() == 1)
return "";
bestLongForm = findBestLongForm(shortForm, longForm);
if (bestLongForm == null)
return "";
tokenizer = new StringTokenizer(bestLongForm, " \t\n\r\f-");
longFormSize = tokenizer.countTokens();
shortFormSize = shortForm.length();
for (int i=shortFormSize - 1; i >= 0; i--)
if (!Character.isLetterOrDigit(shortForm.charAt(i)))
shortFormSize--;
if (bestLongForm.length() < shortForm.length() ||
bestLongForm.indexOf(shortForm + " ") > -1 ||
bestLongForm.endsWith(shortForm) ||
longFormSize > shortFormSize * 2 ||
longFormSize > shortFormSize + 5 ||
shortFormSize > 10)
return "";
System.out.println(shortForm + delimiter + bestLongForm);
return bestLongForm;
}
private static void usage() {
System.err.println("Usage: ExtractAbbrev [-options] <filename>");
System.err.println(" <filename> contains text from which abbreviations are extracted" );
System.err.println(" -testlist <file> = list of true abbreviation definition pairs");
System.err.println(" -usage or -help = this message");
System.exit(1);
}
public static void main(String[] args) {
String shortForm, longForm, defnString, str;
ExtractAbbrev extractAbbrev = new ExtractAbbrev();
Vector candidates;
String[] candidate;
String filename = null;
String testList = null;
//parse arguments
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-testlist")) {
if (i == args.length - 1) {
usage();
}
testList = args[++i];
extractAbbrev.testMode = true;
} else if (args[i].equals("-usage")) {
usage();
} else if (args[i].equals("-help")) {
usage();
} else {
filename = args[i];
// Must be last arg
if (i != args.length - 1) {
usage();
}
}
}
if (filename == null) {
usage();
}
if (extractAbbrev.testMode)
extractAbbrev.loadTrueDefinitions(testList);
extractAbbrev.extractAbbrPairs(filename);
if (extractAbbrev.testMode)
System.out.println("TP: " + extractAbbrev.truePositives + " FP: " + extractAbbrev.falsePositives +
" FN: " + extractAbbrev.falseNegatives + " TN: " + extractAbbrev.trueNegatives);
}
public static Map<String,String> mainAlternative(String sentence) {
// sentence = "We investigated the potential of\n" +
// "human bone marrow derived Mesenchymal stem cells MSCs for neuronal\n" +
// "differentiation in vitro on poly L lactic acid co poly 3 caprolactone /Collagen\n" +
// "PLCL/Coll nanofibrous scaffolds.";
ExtractAbbrev extract = new ExtractAbbrev();
java.util.regex.Matcher matcher = java.util.regex.Pattern.compile("(?:[a-z]*[A-Z][a-z]*){2,}").matcher(sentence);
Map<String,String> map = new HashMap<String,String>();
while (matcher.find()) {
String candidate = matcher.group();
String definition = extract.extractAbbrPair(candidate, sentence.substring(0,matcher.start()));
if(definition!=null && !definition.trim().equals("")) {
map.put(candidate, definition);
}
}
return map;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment