Commit 3dd333b9 authored by javi
Browse files

splitting the information in annotations file and text file

parent 074d193f
......@@ -37,7 +37,7 @@
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>
</dependencies>
</dependencies>
<build>
<plugins>
......
......@@ -4,8 +4,9 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
......@@ -24,6 +25,7 @@ import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.maven.shared.utils.io.FileUtils;
import com.fasterxml.jackson.core.JsonGenerationException;
import com.google.gson.Gson;
......@@ -112,6 +114,14 @@ public class App {
if(!outputDirectory.exists())
outputDirectory.mkdirs();
Set<String> processedFiles = null;
try {
processedFiles = getFiles(outputFilePath);
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
try {
Gate.init();
} catch (GateException e) {
......@@ -125,7 +135,7 @@ public class App {
}
try {
process(inputFilePath, outputFilePath,workdirPath, annotationSet, annotationSetRelationExtraction);
process(inputFilePath, outputFilePath,workdirPath, annotationSet, annotationSetRelationExtraction, processedFiles);
} catch (IOException e) {
e.printStackTrace();
}
......@@ -136,18 +146,21 @@ public class App {
* @param properties_parameters_path
* @throws IOException
*/
public static void process(String inputDirectoryPath, String outputDirectoryPath, String workdir, String annotationSet, String annotationSetRelationExtraction) throws IOException {
public static void process(String inputDirectoryPath, String outputDirectoryPath, String workdir, String annotationSet, String annotationSetRelationExtraction, Set<String> processedFiles) throws IOException {
System.out.println("App::processTagger :: INIT ");
if (java.nio.file.Files.isDirectory(Paths.get(inputDirectoryPath))) {
File inputDirectory = new File(inputDirectoryPath);
File[] files = inputDirectory.listFiles();
System.out.println("Total files : " + files.length);
System.out.println("Files already processed : " + processedFiles.size());
for (File file : files) {
if(file.getName().endsWith(".xml")){
if(file.getName().endsWith(".xml") && !processedFiles.contains(FileUtils.removeExtension(file.getName()))){
try {
System.out.println("App::process :: processing file : " + file.getAbsolutePath());
String fileOutPutName = file.getName().replace(".xml", ".json");
File outputGATEFile = new File (outputDirectoryPath + File.separator + fileOutPutName);
processDocument(file, outputGATEFile, annotationSet, annotationSetRelationExtraction);
String fileOutPutName = file.getName();
File outputAbstractFile = new File (outputDirectoryPath + File.separator + fileOutPutName.replace(".xml", "_abstract.json"));
File outputAnnotationsFile = new File (outputDirectoryPath + File.separator + fileOutPutName.replace(".xml", "_annotations.json"));
processDocument(file, outputAbstractFile, outputAnnotationsFile, annotationSet, annotationSetRelationExtraction);
} catch (ResourceInstantiationException e) {
System.out.println("App::process :: error with document " + file.getAbsolutePath());
e.printStackTrace();
......@@ -178,15 +191,19 @@ public class App {
* @throws JsonGenerationException
* @throws InvalidOffsetException
*/
private static void processDocument(File inputFile, File outputGATEFile, String annotationSet, String annotationSetRelationExtraction ) throws ResourceInstantiationException, JsonGenerationException, IOException, InvalidOffsetException{
private static void processDocument(File inputFile, File outputTextFile, File outputAnnotationsFile, String annotationSet, String annotationSetRelationExtraction ) throws ResourceInstantiationException, JsonGenerationException, IOException, InvalidOffsetException{
gate.Document doc = Factory.newDocument(inputFile.toURI().toURL(), "UTF-8");
Gson gsonBuilder = new GsonBuilder().create();
JsonObject document = new JsonObject();
document.addProperty("name", doc.getName().substring(0, doc.getName().indexOf(".xml")+4));
JsonObject annotated_document = new JsonObject();
long id_document = System.currentTimeMillis();
String name = doc.getName().substring(0, doc.getName().indexOf(".xml")+4);
String plainText = doc.getContent().getContent(0l, gate.Utils.lengthLong(doc)).toString();
document.addProperty("text", plainText);
document.addProperty("textWithAnnotations", new String(Files.readAllBytes(inputFile.getAbsoluteFile().toPath()), StandardCharsets.UTF_8));
document.addProperty("id", System.currentTimeMillis());
//to include also the gate annotated document as text
//document.addProperty("textWithAnnotations", new String(Files.readAllBytes(inputFile.getAbsoluteFile().toPath()), StandardCharsets.UTF_8));
annotated_document.addProperty("id", id_document);
annotated_document.addProperty("name", name);
JsonObject section = new JsonObject();
section.addProperty("name", "document");
Set<String> types = Stream.of("FINDING","SEX","SPECIMEN","GROUP","DOSE","MANIFESTATION_FINDING","RISK_LEVEL","NO_TREATMENT_RELATED_TRIGGER",
......@@ -214,7 +231,7 @@ public class App {
entities.add(type, type_array);
}
document.add("annotations", entities);
annotated_document.add("annotations", entities);
JsonArray findings = new JsonArray();
AnnotationSet as2 = doc.getAnnotations(annotationSetRelationExtraction);
int id = 0;
......@@ -262,10 +279,27 @@ public class App {
}
findings.add(findingObject);
}
document.add("findings", findings);
java.io.Writer out = new java.io.BufferedWriter(new java.io.OutputStreamWriter(new FileOutputStream(outputGATEFile, false)));
out.write(gsonBuilder.toJson(document));
out.close();
annotated_document.add("findings", findings);
//write the annotations to file annotations
java.io.Writer writer1 = new java.io.BufferedWriter(new java.io.OutputStreamWriter(new FileOutputStream(outputAnnotationsFile, false)));
writer1.write(gsonBuilder.toJson(annotated_document));
writer1.flush();
writer1.close();
writer1 = null;
annotated_document=null;
//document text
JsonObject text_document = new JsonObject();
text_document.addProperty("id", id_document);
text_document.addProperty("name", name);
text_document.addProperty("text", plainText);
//write to file document text
java.io.Writer writer2 = new java.io.BufferedWriter(new java.io.OutputStreamWriter(new FileOutputStream(outputTextFile, false)));
writer2.write(gsonBuilder.toJson(text_document));
writer2.flush();
writer2.close();
writer2=null;
text_document = null;
}
......@@ -275,4 +309,22 @@ public class App {
Collections.sort(mainList, new NumberAwareStringComparator());
return mainList;
}
/**
 * Returns the base names of the documents that already have output files in a directory.
 *
 * <p>Each regular file directly inside {@code dir} contributes one entry: its file name with
 * the extension removed and, if present, one trailing {@code _abstract} or
 * {@code _annotations} marker stripped. {@code process} writes its results as
 * {@code <name>_abstract.json} and {@code <name>_annotations.json} but looks up the bare
 * {@code <name>} of each input file, so the markers have to be removed here for that lookup
 * to ever match. Names without one of these markers are returned unchanged.
 *
 * <p>Sub-directories are ignored and the listing is not recursive. A document is reported as
 * soon as either of its output files exists.
 *
 * @param dir path of the directory to list
 * @return the distinct document base names found in {@code dir}; empty if it holds no regular files
 * @throws IOException if the directory cannot be opened or read
 */
public static Set<String> getFiles(String dir) throws IOException {
    // Markers that process() appends to the input base name when naming its two output files.
    final String[] outputSuffixes = {"_abstract", "_annotations"};

    Set<String> fileList = new HashSet<>();
    try (DirectoryStream<Path> stream = Files.newDirectoryStream(Paths.get(dir))) {
        for (Path path : stream) {
            if (Files.isDirectory(path)) {
                continue;
            }
            // Same extension handling as the lookup side, so both derive identical keys.
            String baseName = FileUtils.removeExtension(path.getFileName().toString());
            for (String suffix : outputSuffixes) {
                if (baseName.endsWith(suffix)) {
                    // Strip at most one marker: an input that itself ends in "_abstract"
                    // produces "<name>_abstract_abstract.json" and must map back to
                    // "<name>_abstract".
                    baseName = baseName.substring(0, baseName.length() - suffix.length());
                    break;
                }
            }
            fileList.add(baseName);
        }
    }
    return fileList;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment