|
1 | 1 | /* This library is under the 3-Clause BSD License |
2 | 2 |
|
3 | | - Copyright (c) 2018-2024, Orange S.A. |
| 3 | + Copyright (c) 2018-2025, Orange S.A. |
4 | 4 |
|
5 | 5 | Redistribution and use in source and binary forms, with or without modification, |
6 | 6 | are permitted provided that the following conditions are met: |
|
28 | 28 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | 29 |
|
30 | 30 | @author Johannes Heinecke |
31 | | - @version 2.27.0 as of 28th September 2024 |
| 31 | + @version 2.30.0 as of 12th April 2025 |
32 | 32 | */ |
33 | 33 | package com.orange.labs.conllparser; |
34 | 34 |
|
@@ -82,36 +82,23 @@ public class ConllFile { |
82 | 82 | * open CoNLL-U File and read its contents |
83 | 83 | * |
84 | 84 | * @param file CONLL file |
85 | | - * @param ignoreSentencesWithoutAnnot ignore sentences which do not have any |
86 | | - * information above columns 12 |
87 | | - * @param ignoreSentencesWithoutTarget ignore sentences which do not have |
88 | | - * any target as annotation |
89 | 85 | * @throws IOException |
90 | | - * @throws com.orange.labs.nlp.conllparser.ConllWord.ConllWordException |
| 86 | + * @throws ConllException |
91 | 87 | */ |
public ConllFile(File file) throws IOException, ConllException {
    this.file = file;
    // try-with-resources guarantees the stream is closed even when parse(...)
    // throws an IOException or ConllException; the former explicit close()
    // was skipped in that case and leaked the file handle.
    try (FileInputStream fis = new FileInputStream(file)) {
        parse(fis);
    }
}
98 | 94 |
|
99 | 95 | /** |
100 | | - * |
101 | | - * @param filecontents contenu du fichier COLL |
102 | | - * @param ignoreSentencesWithoutAnnot ignore sentences which do not have any |
103 | | - * information above columns 12 |
104 | | - * @param ignoreSentencesWithoutTarget ignore sentences which do not have |
105 | | - * any target as annotation |
| 96 | + * @param file CONLL file |
| 97 | + * @param cs class to use instead of ConllSentence (must be a subclass) |
| 98 | +
|
106 | 99 | * @throws ConllException |
107 | 100 | * @throws IOException |
108 | 101 | */ |
109 | | -// public ConllFile(String filecontents/*, boolean ignoreSentencesWithoutAnnot, boolean ignoreSentencesWithoutTarget*/) throws ConllException, IOException { |
110 | | -// this.file = new File("__contents__"); |
111 | | -// InputStream inputStream = new ByteArrayInputStream(filecontents.getBytes(StandardCharsets.UTF_8)); |
112 | | -// parse(inputStream/*, ignoreSentencesWithoutAnnot, ignoreSentencesWithoutTarget*/); |
113 | | -// } |
114 | | - |
115 | 102 | public ConllFile(File file, Class<? extends ConllSentence> cs) throws IOException, ConllException { |
116 | 103 | this.file = file; |
117 | 104 | conllsentenceSubclass = cs; |
@@ -316,6 +303,28 @@ public List<ConllSentence> getSentences() { |
316 | 303 | return sentences; |
317 | 304 | } |
318 | 305 |
|
| 306 | + /** get the sentence which contains the linenumber ln (in the conllu file). |
| 307 | + * if any sentence is modified we recalculate. For long files this can take some time |
| 308 | + * @param ln the line number for which we search the sentence |
| 309 | + * @return an arry of sentence number, position of the given line in the sentence, comments length |
| 310 | + */ |
| 311 | + public int[] getSentence_with_line(int ln) { |
| 312 | + int ends_after = 0; |
| 313 | + int first_line = 1; |
| 314 | + int sn = 0; |
| 315 | + for (ConllSentence csent : sentences) { |
| 316 | + ends_after += csent.get_source_length(); |
| 317 | + if (ends_after >= ln) { |
| 318 | + int[] sn_offset = {sn, first_line, csent.get_comment_length()}; |
| 319 | + |
| 320 | + return sn_offset; |
| 321 | + } |
| 322 | + first_line = ends_after + 1; |
| 323 | + sn++; |
| 324 | + } |
| 325 | + return null; |
| 326 | + } |
| 327 | + |
319 | 328 | public void addSentences(List<ConllSentence> s) { |
320 | 329 | sentences.addAll(s); |
321 | 330 | } |
|
0 commit comments