Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
INB
eTRANSAFE
pretox-sr-domain-identification
Commits
3dd333b9
Commit
3dd333b9
authored
Sep 02, 2020
by
javi
Browse files
splitting the information in annotations file and text file
parent
074d193f
Changes
2
Hide whitespace changes
Inline
Side-by-side
pom.xml
View file @
3dd333b9
...
...
@@ -37,7 +37,7 @@
<artifactId>
commons-cli
</artifactId>
<version>
1.4
</version>
</dependency>
</dependencies>
</dependencies>
<build>
<plugins>
...
...
src/main/java/es/bsc/inb/ades/export/json/main/App.java
View file @
3dd333b9
...
...
@@ -4,8 +4,9 @@ import java.io.File;
import
java.io.FileOutputStream
;
import
java.io.IOException
;
import
java.net.MalformedURLException
;
import
java.nio.
charset.StandardCharsets
;
import
java.nio.
file.DirectoryStream
;
import
java.nio.file.Files
;
import
java.nio.file.Path
;
import
java.nio.file.Paths
;
import
java.util.ArrayList
;
import
java.util.Collections
;
...
...
@@ -24,6 +25,7 @@ import org.apache.commons.cli.HelpFormatter;
import
org.apache.commons.cli.Option
;
import
org.apache.commons.cli.Options
;
import
org.apache.commons.cli.ParseException
;
import
org.apache.maven.shared.utils.io.FileUtils
;
import
com.fasterxml.jackson.core.JsonGenerationException
;
import
com.google.gson.Gson
;
...
...
@@ -112,6 +114,14 @@ public class App {
if
(!
outputDirectory
.
exists
())
outputDirectory
.
mkdirs
();
Set
<
String
>
processedFiles
=
null
;
try
{
processedFiles
=
getFiles
(
outputFilePath
);
}
catch
(
IOException
e1
)
{
// TODO Auto-generated catch block
e1
.
printStackTrace
();
}
try
{
Gate
.
init
();
}
catch
(
GateException
e
)
{
...
...
@@ -125,7 +135,7 @@ public class App {
}
try
{
process
(
inputFilePath
,
outputFilePath
,
workdirPath
,
annotationSet
,
annotationSetRelationExtraction
);
process
(
inputFilePath
,
outputFilePath
,
workdirPath
,
annotationSet
,
annotationSetRelationExtraction
,
processedFiles
);
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
...
...
@@ -136,18 +146,21 @@ public class App {
* @param properties_parameters_path
* @throws IOException
*/
public
static
void
process
(
String
inputDirectoryPath
,
String
outputDirectoryPath
,
String
workdir
,
String
annotationSet
,
String
annotationSetRelationExtraction
)
throws
IOException
{
public
static
void
process
(
String
inputDirectoryPath
,
String
outputDirectoryPath
,
String
workdir
,
String
annotationSet
,
String
annotationSetRelationExtraction
,
Set
<
String
>
processedFiles
)
throws
IOException
{
System
.
out
.
println
(
"App::processTagger :: INIT "
);
if
(
java
.
nio
.
file
.
Files
.
isDirectory
(
Paths
.
get
(
inputDirectoryPath
)))
{
File
inputDirectory
=
new
File
(
inputDirectoryPath
);
File
[]
files
=
inputDirectory
.
listFiles
();
System
.
out
.
println
(
"Total files : "
+
files
.
length
);
System
.
out
.
println
(
"Files already processed : "
+
processedFiles
.
size
());
for
(
File
file
:
files
)
{
if
(
file
.
getName
().
endsWith
(
".xml"
)){
if
(
file
.
getName
().
endsWith
(
".xml"
)
&&
!
processedFiles
.
contains
(
FileUtils
.
removeExtension
(
file
.
getName
()))
){
try
{
System
.
out
.
println
(
"App::process :: processing file : "
+
file
.
getAbsolutePath
());
String
fileOutPutName
=
file
.
getName
().
replace
(
".xml"
,
".json"
);
File
outputGATEFile
=
new
File
(
outputDirectoryPath
+
File
.
separator
+
fileOutPutName
);
processDocument
(
file
,
outputGATEFile
,
annotationSet
,
annotationSetRelationExtraction
);
String
fileOutPutName
=
file
.
getName
();
File
outputAbstractFile
=
new
File
(
outputDirectoryPath
+
File
.
separator
+
fileOutPutName
.
replace
(
".xml"
,
"_abstract.json"
));
File
outputAnnotationsFile
=
new
File
(
outputDirectoryPath
+
File
.
separator
+
fileOutPutName
.
replace
(
".xml"
,
"_annotations.json"
));
processDocument
(
file
,
outputAbstractFile
,
outputAnnotationsFile
,
annotationSet
,
annotationSetRelationExtraction
);
}
catch
(
ResourceInstantiationException
e
)
{
System
.
out
.
println
(
"App::process :: error with document "
+
file
.
getAbsolutePath
());
e
.
printStackTrace
();
...
...
@@ -178,15 +191,19 @@ public class App {
* @throws JsonGenerationException
* @throws InvalidOffsetException
*/
private
static
void
processDocument
(
File
inputFile
,
File
output
GATE
File
,
String
annotationSet
,
String
annotationSetRelationExtraction
)
throws
ResourceInstantiationException
,
JsonGenerationException
,
IOException
,
InvalidOffsetException
{
private
static
void
processDocument
(
File
inputFile
,
File
output
TextFile
,
File
outputAnnotations
File
,
String
annotationSet
,
String
annotationSetRelationExtraction
)
throws
ResourceInstantiationException
,
JsonGenerationException
,
IOException
,
InvalidOffsetException
{
gate
.
Document
doc
=
Factory
.
newDocument
(
inputFile
.
toURI
().
toURL
(),
"UTF-8"
);
Gson
gsonBuilder
=
new
GsonBuilder
().
create
();
JsonObject
document
=
new
JsonObject
();
document
.
addProperty
(
"name"
,
doc
.
getName
().
substring
(
0
,
doc
.
getName
().
indexOf
(
".xml"
)+
4
));
JsonObject
annotated_document
=
new
JsonObject
();
long
id_document
=
System
.
currentTimeMillis
();
String
name
=
doc
.
getName
().
substring
(
0
,
doc
.
getName
().
indexOf
(
".xml"
)+
4
);
String
plainText
=
doc
.
getContent
().
getContent
(
0
l
,
gate
.
Utils
.
lengthLong
(
doc
)).
toString
();
document
.
addProperty
(
"text"
,
plainText
);
document
.
addProperty
(
"textWithAnnotations"
,
new
String
(
Files
.
readAllBytes
(
inputFile
.
getAbsoluteFile
().
toPath
()),
StandardCharsets
.
UTF_8
));
document
.
addProperty
(
"id"
,
System
.
currentTimeMillis
());
//to include also the gate annotated document as text
//document.addProperty("textWithAnnotations", new String(Files.readAllBytes(inputFile.getAbsoluteFile().toPath()), StandardCharsets.UTF_8));
annotated_document
.
addProperty
(
"id"
,
id_document
);
annotated_document
.
addProperty
(
"name"
,
name
);
JsonObject
section
=
new
JsonObject
();
section
.
addProperty
(
"name"
,
"document"
);
Set
<
String
>
types
=
Stream
.
of
(
"FINDING"
,
"SEX"
,
"SPECIMEN"
,
"GROUP"
,
"DOSE"
,
"MANIFESTATION_FINDING"
,
"RISK_LEVEL"
,
"NO_TREATMENT_RELATED_TRIGGER"
,
...
...
@@ -214,7 +231,7 @@ public class App {
entities
.
add
(
type
,
type_array
);
}
document
.
add
(
"annotations"
,
entities
);
annotated_
document
.
add
(
"annotations"
,
entities
);
JsonArray
findings
=
new
JsonArray
();
AnnotationSet
as2
=
doc
.
getAnnotations
(
annotationSetRelationExtraction
);
int
id
=
0
;
...
...
@@ -262,10 +279,27 @@ public class App {
}
findings
.
add
(
findingObject
);
}
document
.
add
(
"findings"
,
findings
);
java
.
io
.
Writer
out
=
new
java
.
io
.
BufferedWriter
(
new
java
.
io
.
OutputStreamWriter
(
new
FileOutputStream
(
outputGATEFile
,
false
)));
out
.
write
(
gsonBuilder
.
toJson
(
document
));
out
.
close
();
annotated_document
.
add
(
"findings"
,
findings
);
//write the annotations to file annotations
java
.
io
.
Writer
writer1
=
new
java
.
io
.
BufferedWriter
(
new
java
.
io
.
OutputStreamWriter
(
new
FileOutputStream
(
outputAnnotationsFile
,
false
)));
writer1
.
write
(
gsonBuilder
.
toJson
(
annotated_document
));
writer1
.
flush
();
writer1
.
close
();
writer1
=
null
;
annotated_document
=
null
;
//document text
JsonObject
text_document
=
new
JsonObject
();
text_document
.
addProperty
(
"id"
,
id_document
);
text_document
.
addProperty
(
"name"
,
name
);
text_document
.
addProperty
(
"text"
,
plainText
);
//write to file document text
java
.
io
.
Writer
writer2
=
new
java
.
io
.
BufferedWriter
(
new
java
.
io
.
OutputStreamWriter
(
new
FileOutputStream
(
outputTextFile
,
false
)));
writer2
.
write
(
gsonBuilder
.
toJson
(
text_document
));
writer2
.
flush
();
writer2
.
close
();
writer2
=
null
;
text_document
=
null
;
}
...
...
@@ -275,4 +309,22 @@ public class App {
Collections
.
sort
(
mainList
,
new
NumberAwareStringComparator
());
return
mainList
;
}
/**
 * Collects the names of the entries located directly inside the given directory.
 * Entries that are themselves directories are left out; the file name of every
 * remaining entry is passed through {@code FileUtils.removeExtension} before it is stored.
 *
 * @param dir path of the directory to list
 * @return a set holding the processed file names; empty when no entry qualifies
 * @throws IOException if the directory stream cannot be opened
 */
public static Set<String> getFiles(String dir) throws IOException {
    final Set<String> baseNames = new HashSet<>();
    final Path directory = Paths.get(dir);
    try (DirectoryStream<Path> entries = Files.newDirectoryStream(directory)) {
        for (Path entry : entries) {
            // Only plain entries are of interest; nested directories are ignored.
            if (Files.isDirectory(entry)) {
                continue;
            }
            String fileName = entry.getFileName().toString();
            baseNames.add(FileUtils.removeExtension(fileName));
        }
    }
    return baseNames;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment