|
| 1 | +## Persianp Processing Toolbox |
| 2 | + |
| 3 | +Persianp is a text processing tool developed in Java to accomplish preprocessing tasks in Persian texts. The toolbox performs the following tasks: |
| 4 | +* Character-level normalization |
| 5 | +* Tokenization |
| 6 | +* Lemmatization |
| 7 | +* POS tagging |
| 8 | +* Stopword detection |
| 9 | +* Noun phrase chunking |
| 10 | + |
| 11 | +### Using Persianp from the command line |
| 12 | +Make sure the 'res' folder is located next to the 'jar' file. |
| 13 | + |
| 14 | +```bash |
| 15 | +$ java -cp persianp-toolbox-1.0.jar com.persianp.nlp.process.Process -input inputfile.txt -output outputfile.txt -task (tokenize|tag|lemmatize|taglemmatize) [-nostopword] [-prop propertyFile.properties] |
| 16 | +``` |
| 17 | + |
| 18 | +At the moment, NP chunking is not supported from the command line. |
| 19 | + |
| 20 | +### Using the Persianp API |
| 21 | +Add the API to the libraries of your program. The following example shows how to use the toolbox. |
| 22 | + |
| 23 | +''' |
| 24 | +public class TestPersianp { |
| 25 | + |
| 26 | + public static void main(String[] args) { |
| 27 | + TestPersianp testPersianp = new TestPersianp(); |
| 28 | + testPersianp.process(); |
| 29 | + } |
| 30 | + |
| 31 | + private void process() { |
| 32 | + try { |
| 33 | + Properties properties = new Properties(); |
| 34 | + properties.load(this.getClass().getClassLoader().getResourceAsStream("persianp.properties")); |
| 35 | + Process process = new Process(properties); |
| 36 | + InputStream in = this.getClass().getClassLoader().getResourceAsStream("testText.txt"); |
| 37 | + BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8")); |
| 38 | + String line; |
| 39 | + while ((line = br.readLine()) != null) { |
| 40 | + process.process(line); |
| 41 | + |
| 42 | + System.out.println(process.getText()); |
| 43 | +// process.getTokens(); |
| 44 | +// process.getTokensText(); |
| 45 | +// process.getTags(); |
| 46 | +// process.getChunkTag(); |
| 47 | +// process.getLemmas(); |
| 48 | +// process.getNonStopwordTokens(); |
| 49 | + |
| 50 | + int sentenceSize = process.getSentencesSize(); |
| 51 | + for (int j = 0; j < sentenceSize; ++j) { |
| 52 | +// List tokensText = process.getTokensTextInSentence(j); |
| 53 | +// List tags = process.getTagsInSentence(j); |
| 54 | +// List lemmas = process.getLemmasInSentence(j); |
| 55 | + List tokens = process.getTokensInSentence(j); |
| 56 | + for (int k = 0; k < tokens.size(); ++k) { |
| 57 | + System.out.println(tokens.get(k).getText() + "\t\t\t" + tokens.get(k).getLemma() + "\t\t\t" + tokens.get(k).getTag()); |
| 58 | + } |
| 59 | + } |
| 60 | + } |
| 61 | + in.close(); |
| 62 | + br.close(); |
| 63 | + } catch (Exception e){ |
| 64 | + e.printStackTrace(); |
| 65 | + } |
| 66 | + } |
| 67 | +} |
| 68 | + |
| 69 | +''' |
| 70 | + |
| 71 | +### More Information / Citing This Toolbox |
| 72 | +Please cite the paper below if you use the Persianp toolbox in your research. It also provides more information about the toolbox. |
| 73 | + |
| 74 | +> Mahdi Mohseni, Javad Ghofrani, Heshaam Faili |
| 75 | +> Persianp: A Persian Text Processing Toolbox |
| 76 | +> International Conference on Intelligent Text Processing and Computational Linguistics |
| 77 | +> CICLing 2016: Computational Linguistics and Intelligent Text Processing, pp. 75-87 |
| 78 | + |
| 79 | +Bibtex citation: |
| 80 | + |
| 81 | +```bibtex |
| 82 | +@InProceedings{Persianp2016, |
| 83 | +author="Mohseni, Mahdi |
| 84 | +and Ghofrani, Javad |
| 85 | +and Faili, Heshaam", |
| 86 | +title="Persianp: A Persian Text Processing Toolbox", |
| 87 | +booktitle="Computational Linguistics and Intelligent Text Processing", |
| 88 | +year="2018", |
| 89 | +publisher="Springer International Publishing", |
| 90 | +pages="75--87", |
| 91 | +isbn="978-3-319-75477-2" |
| 92 | +} |
| 93 | +``` |
| 94 | + |
0 commit comments