OpenNLP.java
package StageENSAO;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.JTable;
import javax.swing.table.DefaultTableModel;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
public class OpenNLP implements PatternSearch {
    // shared NLP state: model streams, tokens of the current document, and match results
    static InputStream tokenModelIn = null;
    static InputStream posModelIn = null;
    static int occurences = 0;
    static String[] tokens;
    static ArrayList<Integer> occurs, position;
    static ArrayList<String> origine;

    // Swing table that displays the matches
    private DefaultTableModel mod;
    private JTable resultat;
    private String path = "D:\\stageENSAO";
    public OpenNLP() {
        mod = new DefaultTableModel(new Object[][] {},
                new String[] { "occurence", "origine", "position" });
        resultat = new JTable(mod);
        occurs = new ArrayList<Integer>();
        origine = new ArrayList<String>();
        position = new ArrayList<Integer>();
    }
    // ----------------------------------------------
    // methods
    public void search(String path, String motif) {
        String text = null;
        String[] mot = null;
        String[] lemmas = null;
        try {
            // extract the raw text of the PDF and lemmatize the search pattern
            text = PatternSearch.pdfToTxt(path);
            mot = Lemmatizer(tokenize(motif), POSTagger(tokenize(motif)));
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        try {
            // tokenize, POS-tag and lemmatize the document text
            setTokens(tokenize(text));
            String[] tags = POSTagger(getTokens());
            lemmas = Lemmatizer(getTokens(), tags);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // match the lemmatized pattern against the lemmatized document
        regexChecker(mot[0], lemmas);
        // fill the result table: one row per match
        Object[] row = new Object[3];
        for (int i = 0; i < occurs.size(); i++) {
            row[0] = getOccurs().get(i).toString();
            row[1] = getOrigine().get(i).toString();
            row[2] = getPosition().get(i).toString();
            mod.addRow(row);
        }
    }
    public String[] tokenize(String txt) throws IOException {
        // load the tokenizer model and split the text into tokens
        tokenModelIn = new FileInputStream(path + "\\binFiles\\en-token.bin");
        TokenizerModel tokenModel = new TokenizerModel(tokenModelIn);
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        String[] tokens = tokenizer.tokenize(txt);
        return tokens;
    }
    public String[] POSTagger(String[] tokns) throws IOException {
        posModelIn = new FileInputStream(path + "\\binFiles\\en-pos-maxent.bin");
        // load the part-of-speech model from the stream
        POSModel posModel = new POSModel(posModelIn);
        // initialize the part-of-speech tagger with the model
        POSTaggerME posTagger = new POSTaggerME(posModel);
        // tag the tokens
        String[] tags = posTagger.tag(tokns);
        // probabilities of the tags assigned to the tokens (currently unused)
        double[] probs = posTagger.probs();
        return tags;
    }
    // ***** getters & setters
    public static int getOccurences() {
        return occurences;
    }

    public static void setOccurences(int occurences) {
        OpenNLP.occurences = occurences;
    }

    public static ArrayList<Integer> getOccurs() {
        return occurs;
    }

    public static void setOccurs(ArrayList<Integer> occurs) {
        OpenNLP.occurs = occurs;
    }

    public static ArrayList<Integer> getPosition() {
        return position;
    }

    public static void setPosition(ArrayList<Integer> position) {
        OpenNLP.position = position;
    }

    public static ArrayList<String> getOrigine() {
        return origine;
    }

    public static void setOrigine(ArrayList<String> origine) {
        OpenNLP.origine = origine;
    }

    public void setResultat(JTable resultat) {
        this.resultat = resultat;
    }

    public JTable getResultat() {
        return resultat;
    }
    public String[] Lemmatizer(String[] t1, String[] t2) throws IOException {
        // load the lemmatizer dictionary as an input stream
        InputStream dictLemmatizer = new FileInputStream(path + "\\en-lemmatizer.txt");
        // build the dictionary-based lemmatizer
        DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
        // look up the lemma of each token given its POS tag
        String[] lemmas = lemmatizer.lemmatize(t1, t2);
        return lemmas;
    }
    public String[] Chunker(String[] tk, String[] tg) throws IOException {
        InputStream ins = new FileInputStream(path + "\\binFiles\\en-chunker.bin");
        // load the chunker model
        ChunkerModel chunkerModel = new ChunkerModel(ins);
        // initialize the (maximum entropy) chunker with the model
        ChunkerME chunker = new ChunkerME(chunkerModel);
        // chunk the sentence: chunking requires the sentence to be tokenized and POS-tagged
        String[] chunks = chunker.chunk(tk, tg);
        return chunks;
    }
    public static String[] getTokens() {
        return tokens;
    }

    public static void setTokens(String[] tokens) {
        OpenNLP.tokens = tokens;
    }
    public static void regexChecker(String motifLemmatized, String[] str2Check) {
        Pattern checkRegex = Pattern.compile(motifLemmatized);
        for (int i = 0; i < str2Check.length; i++) {
            Matcher regexMatcher = checkRegex.matcher(str2Check[i]);
            while (regexMatcher.find()) {
                // record the running match count, the original token and its position
                setOccurences(getOccurences() + 1);
                occurs.add(getOccurences());
                origine.add(tokens[i]);
                position.add(i);
            }
        }
    }
}
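
/*
 * A minimal usage sketch, not part of the original file. It assumes that the
 * PatternSearch interface (not shown here) declares search(String, String) and a
 * static pdfToTxt(String) helper, that the OpenNLP model files (en-token.bin,
 * en-pos-maxent.bin) and en-lemmatizer.txt exist under D:\stageENSAO as hard-coded
 * above, and that "D:\\stageENSAO\\doc.pdf" is a hypothetical input document.
 */
class OpenNLPUsageExample {
    public static void main(String[] args) {
        OpenNLP nlp = new OpenNLP();
        // search the (lemmatized) document text for the lemma of the given pattern word
        nlp.search("D:\\stageENSAO\\doc.pdf", "analyze");
        // show the result table (occurrence number, original token, token position)
        javax.swing.JFrame frame = new javax.swing.JFrame("OpenNLP pattern search");
        frame.add(new javax.swing.JScrollPane(nlp.getResultat()));
        frame.setDefaultCloseOperation(javax.swing.JFrame.EXIT_ON_CLOSE);
        frame.pack();
        frame.setVisible(true);
    }
}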