diff --git a/pretox_classificator_sentences.ser b/pretox_classificator_sentences.ser index b88712550d3ad1dd78c8553d312cfbc56394f756..a72da11d8af50cfe28db5252ea64b2bf58ff6f3e 100644 Binary files a/pretox_classificator_sentences.ser and b/pretox_classificator_sentences.ser differ diff --git a/src/main/java/es/bsc/inb/nlp/classifier/main/App.java b/src/main/java/es/bsc/inb/nlp/classifier/main/App.java index 7d78337c3afa49351aaab9fc7d6e15611c5972b2..c609aa4c50af918e01a670bd4cc1982d0cecc9c6 100644 --- a/src/main/java/es/bsc/inb/nlp/classifier/main/App.java +++ b/src/main/java/es/bsc/inb/nlp/classifier/main/App.java @@ -70,6 +70,10 @@ public class App { threads_option.setRequired(false); options.addOption(threads_option); + Option relevant_class_option = new Option("c", "relevant_class", true, "Relevant Class. The positive relevant classification string of the sentence. By default is PRETOX_REL"); + relevant_class_option.setRequired(false); + options.addOption(relevant_class_option); + CommandLineParser parser = new DefaultParser(); HelpFormatter formatter = new HelpFormatter(); CommandLine cmd = null; @@ -87,7 +91,7 @@ public class App { String annotationSet = cmd.getOptionValue("annotation_set"); String modelPath = cmd.getOptionValue("model"); String threads_str = cmd.getOptionValue("threads"); - + String relevant_class = cmd.getOptionValue("relevant_class"); if (!java.nio.file.Files.isDirectory(Paths.get(inputFilePath))) { System.out.println("Please set the inputDirectoryPath "); System.exit(1); @@ -134,6 +138,10 @@ public class App { modelPath = workdirPath + "pretox_classificator_sentences.ser"; } + if(relevant_class==null) { + relevant_class = "PRETOX_REL"; + } + ColumnDataClassifier model = null; try { ByteArrayInputStream bais = new ByteArrayInputStream(FileUtils.readFileToByteArray(new File(modelPath))); @@ -155,7 +163,7 @@ public class App { } try { - process(threads, inputFilePath, processedFiles, outputFilePath, workdirPath, annotationSet, model); + process(threads, inputFilePath, processedFiles, outputFilePath, workdirPath, annotationSet, model, relevant_class); } catch (IOException e) { e.printStackTrace(); } @@ -166,7 +174,7 @@ public class App { * @param properties_parameters_path * @throws IOException */ - public static void process(Integer threads, String inputDirectoryPath,Set processedFiles,String outputDirectoryPath, String workdir, String annotationSet, ColumnDataClassifier model) throws IOException { + public static void process(Integer threads, String inputDirectoryPath,Set processedFiles,String outputDirectoryPath, String workdir, String annotationSet, ColumnDataClassifier model, String relevant_class) throws IOException { System.out.println("App :: process :: INIT "); if (java.nio.file.Files.isDirectory(Paths.get(inputDirectoryPath))) { File inputDirectory = new File(inputDirectoryPath); @@ -196,7 +204,7 @@ public class App { List futuresList = new ArrayList(); ExecutorService eservice = Executors.newFixedThreadPool(10); for(int index = 0; index < threads; index++) - futuresList.add(eservice.submit(new Process(index, list.get(index), processedFiles, outputDirectoryPath, annotationSet, model))); + futuresList.add(eservice.submit(new Process(index, list.get(index), processedFiles, outputDirectoryPath, annotationSet, model, relevant_class))); Object taskResult; for(Future future:futuresList) { diff --git a/src/main/java/es/bsc/inb/nlp/classifier/main/Process.java b/src/main/java/es/bsc/inb/nlp/classifier/main/Process.java index 79670c1619d10cff22917abbab1b5468591ebba8..3e3cecd0fb0a2f16c643dd088c828ec8247a02f3 100644 --- a/src/main/java/es/bsc/inb/nlp/classifier/main/Process.java +++ b/src/main/java/es/bsc/inb/nlp/classifier/main/Process.java @@ -14,6 +14,7 @@ import edu.stanford.nlp.ling.Datum; import gate.Annotation; import gate.AnnotationSet; import gate.Factory; +import gate.FeatureMap; import gate.creole.ResourceInstantiationException; import gate.util.InvalidOffsetException; @@ -38,7 +39,9 @@ public class Process implements Runnable { ColumnDataClassifier model = null; - public Process(int name, File[] files, Set processedFiles,String outputPath, String annotationSet, ColumnDataClassifier model) { + String relevant_class = null; + + public Process(int name, File[] files, Set processedFiles,String outputPath, String annotationSet, ColumnDataClassifier model, String relevant_class) { super(); this.name=name; this.files = files; @@ -46,6 +49,7 @@ public class Process implements Runnable { this.annotationSet = annotationSet; this.outputPath = outputPath; this.model = model; + this.relevant_class = relevant_class; } @@ -59,7 +63,7 @@ public class Process implements Runnable { fileOutPutName = fileOutPutName.replace(".txt", ".xml"); } File outputGATEFile = new File (outputPath + File.separator + fileOutPutName); - processDocument(file, annotationSet, model, outputGATEFile); + processDocument(file, annotationSet, model, outputGATEFile, relevant_class); fileOutPutName=null; outputGATEFile=null; } catch (ResourceInstantiationException e) { @@ -86,7 +90,7 @@ public class Process implements Runnable { * @throws MalformedURLException * @throws InvalidOffsetException */ - private static void processDocument(File file, String annotationSet, ColumnDataClassifier model, File outputGATEFile) throws ResourceInstantiationException, MalformedURLException, InvalidOffsetException { + private static void processDocument(File file, String annotationSet, ColumnDataClassifier model, File outputGATEFile, String relevant_class) throws ResourceInstantiationException, MalformedURLException, InvalidOffsetException { try { gate.Document gateDocument = Factory.newDocument(file.toURI().toURL(), "UTF-8"); AnnotationSet as = gateDocument.getAnnotations(annotationSet); @@ -95,9 +99,24 @@ public class Process implements Runnable { for (Annotation sentence : sentences) { String str_sentence = gate.Utils.stringFor(gateDocument, sentence); str_sentence = str_sentence.replaceAll("\\R+", " "); - Datum d = model.makeDatumFromLine("\t" + str_sentence); - String relevant_class = model.classOf(d); - sentence.getFeatures().put("RELEVANT", relevant_class); + StringBuilder features_str = new StringBuilder(); + FeatureMap features = sentence.getFeatures(); + for (Object feature_name : features.keySet()) { + if(feature_name.toString().startsWith("f_")) { + features_str.append(features.get(feature_name)); + features_str.append("\t"); + } + } + String s = features_str.length() > 0 ? features_str.substring(0, features_str.length() - 1): ""; + Datum d = model.makeDatumFromLine("\t" + str_sentence + "\t" + s); + String classificationLabel = model.classOf(d); + sentence.getFeatures().put("RELEVANT", classificationLabel); + if(classificationLabel.equals(relevant_class)) { + gate.FeatureMap featuresRelevant = Factory.newFeatureMap(); + featuresRelevant.put("CLASS", relevant_class); + gateDocument.getAnnotations(annotationSet).add(sentence.getStartNode().getOffset(), sentence.getEndNode().getOffset(), relevant_class, featuresRelevant); + featuresRelevant=null; + } str_sentence = null; d=null; relevant_class=null;