diff --git a/ClaviusLemmata/src/ilc/cnr/it/clavius/constants/HandleConstants.java b/ClaviusLemmata/src/ilc/cnr/it/clavius/constants/HandleConstants.java index 7839901..8b706b4 100644 --- a/ClaviusLemmata/src/ilc/cnr/it/clavius/constants/HandleConstants.java +++ b/ClaviusLemmata/src/ilc/cnr/it/clavius/constants/HandleConstants.java @@ -11,16 +11,13 @@ public class HandleConstants { private final static String letterRif = "147"; private final static String TeiFile = "147-transcription.xml"; - private final static String workDir = "C:/tmp/Clavius/TEI-MarkUp/08042014/"+letterRif+"/"; - //"C:/tmp/Clavius/TEI-MarkUp/08042014/"+letterRif+"/"; - //"C:/tmp/Clavius/TEI-MarkUp/08042014/136/"; - // "C:/tmp/MP/1/"; - - //"136_APUG_530_cc.138-139.xml";"aeneis_1.xml"; + private final static String workDir = + //"http://claviusontheweb.it:8080/exist/rest//db/clavius/documents/"; + "/Users/angelodel80/Risorse/sources/clavius_workshop/"+letterRif+"/"; private final static String xmlTeiFile = workDir+TeiFile; - private final static String FullTextFile = workDir+"fullText.txt"; + private final static String FullTextFile = workDir+letterRif+".txt"; private final static String modelforHunPos = "testFirst.model"; private final static String tabFileAnalyzed = workDir+"out-tokens_Lemmata.txt"; @@ -29,8 +26,8 @@ public class HandleConstants { private final static String letterAnalyzed = "/Letter"+HandleConstants.letterRif+"_sentences_Analyzed"; - private final static String pathToHunPos = "C:/opt/hunpos-1.0-win/hunpos-1.0-win/hunpos-tag.exe"; - private final static String pathToHunPosModel = "C:/opt/hunpos-1.0-win/hunpos-1.0-win/"; + private final static String pathToHunPos = "/Users/angelodel80/Risorse/tools/hunpos-1.0-macosx/hunpos-tag"; + private final static String pathToHunPosModel = "/Users/angelodel80/Risorse/tools/"; private final static String xpathForSentences = "/tei:TEI/tei:text/tei:body/tei:div/tei:div/tei:ab/tei:s"; diff --git a/ClaviusLemmata/src/ilc/cnr/it/clavius/utils/ClaviusUtils.java b/ClaviusLemmata/src/ilc/cnr/it/clavius/utils/ClaviusUtils.java index 4f5e703..236f700 100644 --- a/ClaviusLemmata/src/ilc/cnr/it/clavius/utils/ClaviusUtils.java +++ b/ClaviusLemmata/src/ilc/cnr/it/clavius/utils/ClaviusUtils.java @@ -3,6 +3,7 @@ */ package ilc.cnr.it.clavius.utils; +import ilc.cnr.it.clavius.HunposTagger; import ilc.cnr.it.clavius.constants.HandleConstants; import java.io.BufferedInputStream; @@ -21,14 +22,19 @@ import org.apache.commons.io.IOUtils; import org.jdom2.Document; import org.jdom2.Element; +import org.jdom2.JDOMException; +import org.jdom2.Namespace; +import org.jdom2.filter.Filters; import org.jdom2.output.Format; import org.jdom2.output.XMLOutputter; +import org.jdom2.xpath.XPathExpression; +import org.jdom2.xpath.XPathFactory; import com.mysql.jdbc.DocsConnectionPropsHelper; /** * @author angelodel80 - * + * */ public class ClaviusUtils { @@ -38,28 +44,30 @@ public class ClaviusUtils { public ClaviusUtils() { } - public static void verifyFile( File aFile, boolean w ) { + public static void verifyFile(File aFile, boolean w) { if (aFile == null) { throw new IllegalArgumentException("File should not be null."); } if (!aFile.exists()) { - throw new IllegalArgumentException ("File does not exist: " + aFile); + throw new IllegalArgumentException("File does not exist: " + aFile); } if (!aFile.isFile()) { - throw new IllegalArgumentException("Should not be a directory: " + aFile); + throw new IllegalArgumentException("Should not be a directory: " + + aFile); } if (w && !aFile.canWrite()) { - throw new IllegalArgumentException("File cannot be written: " + aFile); + throw new IllegalArgumentException("File cannot be written: " + + aFile); } System.out.println("verifying file: " + aFile.getAbsolutePath() + "\n"); } - public static String streamToString(InputStream is){ + public static String streamToString(InputStream is) { String ret = ""; - if(null != is) { + if (null != is) { BufferedInputStream binp = new BufferedInputStream(is); StringWriter writer = new StringWriter(); @@ -69,7 +77,6 @@ public static String streamToString(InputStream is){ ex.printStackTrace(); } - ret = writer.toString(); try { binp.close(); @@ -89,23 +96,21 @@ public static List makeDocs(BufferedReader reader) { Element doc = null; try { - while (null != (line = reader.readLine() )){ - if(line.matches("")){ + while (null != (line = reader.readLine())) { + if (line.matches("")) { System.err.println("ritorno a capo"); - } - else if( (line.split("\t").length) < 2){ - if(null != doc) + } else if ((line.split("\t").length) < 2) { + if (null != doc) docs.add(doc); System.err.println("testo della sentence: " + line); wordcount = 0; List fields = makeFields(10); - handleSentenceFields(line,fields); + handleSentenceFields(line, fields); doc = new Element("doc").addContent(fields); - } - else { + } else { System.err.println("riga di analisi: " + line); wordcount++; - handleAnalysisFields(line,doc,wordcount); + handleAnalysisFields(line, doc, wordcount); } } docs.add(doc); @@ -118,88 +123,162 @@ else if( (line.split("\t").length) < 2){ } private static void handleAnalysisFields(String line, Element doc, int count) { - try{ - doc.getChildren().get(8).addContent( - makeElement(line.split("\t")[0], line.split("\t")[1], line.split("\t")[2],count,doc.getChildren("field").get(1).getText())); - doc.getChildren().get(9).addContent( - makeElement(line.split("\t")[0], line.split("\t")[1] + line.substring(line.lastIndexOf(9)), line.split("\t")[2],count,doc.getChildren("field").get(1).getText())); - }catch(ArrayIndexOutOfBoundsException e){ - //System.err.println(e.getMessage()); - //e.printStackTrace(); - doc.getChildren().get(8).addContent( - makeElement(line.split("\t")[0], line.split("\t")[1], line.split("\t")[0].substring(line.split("\t")[0].indexOf('@')+1, line.split("\t")[0].indexOf('[')).toLowerCase()+"*",count,doc.getChildren("field").get(1).getText())); - doc.getChildren().get(9).addContent( - makeElement(line.split("\t")[0], line.split("\t")[1] + line.substring(line.lastIndexOf(9)),line.split("\t")[0].substring(line.split("\t")[0].indexOf('@')+1, line.split("\t")[0].indexOf('[')).toLowerCase()+"*",count,doc.getChildren("field").get(1).getText())); + try { + doc.getChildren() + .get(8) + .addContent( + makeElement(line.split("\t")[0], + line.split("\t")[1], line.split("\t")[2], + count, doc.getChildren("field").get(1) + .getText())); + doc.getChildren() + .get(9) + .addContent( + makeElement( + line.split("\t")[0], + line.split("\t")[1] + + line.substring(line + .lastIndexOf(9)), + line.split("\t")[2], count, doc + .getChildren("field").get(1) + .getText())); + } catch (ArrayIndexOutOfBoundsException e) { + // System.err.println(e.getMessage()); + // e.printStackTrace(); + doc.getChildren() + .get(8) + .addContent( + makeElement( + line.split("\t")[0], + line.split("\t")[1], + line.split("\t")[0] + .substring( + line.split("\t")[0] + .indexOf('@') + 1, + line.split("\t")[0] + .indexOf('[')) + .toLowerCase() + + "*", count, + doc.getChildren("field").get(1).getText())); + doc.getChildren() + .get(9) + .addContent( + makeElement( + line.split("\t")[0], + line.split("\t")[1] + + line.substring(line + .lastIndexOf(9)), + line.split("\t")[0] + .substring( + line.split("\t")[0] + .indexOf('@') + 1, + line.split("\t")[0] + .indexOf('[')) + .toLowerCase() + + "*", count, + doc.getChildren("field").get(1).getText())); } } - private static Element makeElement(String token, String pos, String lemma, int count, String sentence){ - return new Element("w").setAttribute("prog", String.valueOf(count)) - .setAttribute("form", token.substring(token.indexOf('@')+1, token.indexOf('[')).toLowerCase()) + private static Element makeElement(String token, String pos, String lemma, + int count, String sentence) { + return new Element("w") + .setAttribute("prog", String.valueOf(count)) + .setAttribute( + "form", + token.substring(token.indexOf('@') + 1, + token.indexOf('[')).toLowerCase()) .setAttribute("pos", pos) .setAttribute("lemma", lemma) - .setAttribute("token",token.substring(token.indexOf('@')+1, token.indexOf('['))) - .setAttribute("extended",token) - .setAttribute("start", String.valueOf(handleOffset(token,sentence,0))) - .setAttribute("end", String.valueOf(handleOffset(token,sentence,1))); + .setAttribute( + "token", + token.substring(token.indexOf('@') + 1, + token.indexOf('['))) + .setAttribute("extended", token) + .setAttribute("start", + String.valueOf(handleOffset(token, sentence, 0))) + .setAttribute("end", + String.valueOf(handleOffset(token, sentence, 1))); } + /* soe : start=0 oppure end=1 */ - public static int handleOffset(String ctsToken, String sentence, int soe){ + public static int handleOffset(String ctsToken, String sentence, int soe) { int ret = 0; int count = 0; - int times = Integer.parseInt(ctsToken.substring(ctsToken.indexOf('[')+1, ctsToken.indexOf(']'))); + int times = Integer.parseInt(ctsToken.substring( + ctsToken.indexOf('[') + 1, ctsToken.indexOf(']'))); System.err.println(times); - String literalToken = ctsToken.substring(ctsToken.indexOf('@')+1, ctsToken.indexOf('[')); + String literalToken = ctsToken.substring(ctsToken.indexOf('@') + 1, + ctsToken.indexOf('[')); String patternQuote = ""; - if(literalToken.matches("\\p{Punct}")){ + if (literalToken.matches("\\p{Punct}")) { System.out.println("in punct:" + literalToken); patternQuote = Pattern.quote(literalToken); - } - else{ - patternQuote = "\\b"+Pattern.quote(literalToken)+"\\b"; + } else { + patternQuote = "\\b" + Pattern.quote(literalToken) + "\\b"; } System.out.println(patternQuote); System.out.println(sentence); - // FIXME attenzione la regex deve avere i boudary del token, altrimenti matcha alche le sottostringhe falsando gli offset + // FIXME attenzione la regex deve avere i boudary del token, altrimenti + // matcha alche le sottostringhe falsando gli offset Pattern pa = Pattern.compile(patternQuote); Matcher ma = pa.matcher(sentence); System.out.println(pa.pattern()); - /* se il token esiste devo prendere la sua giusta occorrenza, quindi scorro i founds finchè non raggiungo l'occorrenza corretta valutando la variabile times */ - while(ma.find()){ - count = count +1; - if(count == times){ - if(0==soe) + /* + * se il token esiste devo prendere la sua giusta occorrenza, quindi + * scorro i founds finchè non raggiungo l'occorrenza corretta valutando + * la variabile times + */ + while (ma.find()) { + count = count + 1; + if (count == times) { + if (0 == soe) ret = ma.start(); else ret = ma.end(); } } - // System.err.println(ma.group()); - // System.err.println(ma.start()); - // System.err.println(ma.end()); + // System.err.println(ma.group()); + // System.err.println(ma.start()); + // System.err.println(ma.end()); return ret; } private static void handleSentenceFields(String line, List fields) { - /* be care the file in input has to start with the correct information with out any header*/ + /* + * be care the file in input has to start with the correct information + * with out any header + */ fields.get(0).getAttribute("name").setValue("id"); - fields.get(0).setText(line.substring(line.indexOf("s_")+2, line.indexOf(":: "))); + fields.get(0).setText( + line.substring(line.indexOf("s_") + 2, line.indexOf(":: "))); fields.get(1).getAttribute("name").setValue("sentence_txt"); - fields.get(1).setText(line.substring(line.indexOf(":: ")+3)); + fields.get(1).setText(line.substring(line.indexOf(":: ") + 3)); fields.get(2).getAttribute("name").setValue("image_url"); - fields.get(2).setText(HandleConstants.getLetterRif()+"-"+line.substring(line.indexOf("s_"), line.indexOf(":: "))+".png"); //OK for Clavius - //fields.get(2).setText(HandleConstants.getLetterRif()+"-"+"xyz.png"); // OK for Virgilius + fields.get(2).setText( + HandleConstants.getLetterRif() + + "-" + + line.substring(line.indexOf("s_"), + line.indexOf(":: ")) + ".png"); // OK for + // Clavius + // fields.get(2).setText(HandleConstants.getLetterRif()+"-"+"xyz.png"); + // // OK for Virgilius fields.get(3).getAttribute("name").setValue("sentence_id"); fields.get(3).setText(line.substring(0, line.indexOf(":: "))); fields.get(4).getAttribute("name").setValue("image_id"); - fields.get(4).setText("CITE for " + line.substring(0, line.indexOf(":: "))); + fields.get(4).setText( + "CITE for " + line.substring(0, line.indexOf(":: "))); fields.get(5).getAttribute("name").setValue("info_sentence"); - fields.get(5).setText("additional information for sentence " + line.substring(0, line.indexOf(":: "))); + fields.get(5).setText( + "additional information for sentence " + + line.substring(0, line.indexOf(":: "))); fields.get(6).getAttribute("name").setValue("info_image"); - fields.get(6).setText("additional information for image " + line.substring(0, line.indexOf(":: "))); + fields.get(6).setText( + "additional information for image " + + line.substring(0, line.indexOf(":: "))); fields.get(7).getAttribute("name").setValue("nota"); fields.get(7).setText(line.substring(0, line.indexOf(":: "))); fields.get(8).getAttribute("name").setValue("sentence_analysis"); @@ -208,16 +287,17 @@ private static void handleSentenceFields(String line, List fields) { private static List makeFields(int n) { List fields = new ArrayList(n); - for(int i = 0; i docsSentences(Element root){ + private static List docsSentences(Element root) { List sentences = null; - if(null!=root){ + if (null != root) { sentences = root.getChildren("doc"); } return sentences; @@ -228,13 +308,17 @@ public static void makeSentenceXML(Document xmlSentences) { XMLOutputter xo = new XMLOutputter(Format.getPrettyFormat()); Element root = xmlSentences.getRootElement(); List docs = docsSentences(root); - System.out.println("in makesentenceXML: size list: " +docs.size()); + System.out.println("in makesentenceXML: size list: " + docs.size()); for (Element doc : docs) { Element tmp_doc = doc.clone(); System.out.println(doc.getChildText("field")); try { - xo.output(new Document().setRootElement(new Element("add").setContent(tmp_doc)), - new FileWriter( HandleConstants.getWorkDir()+ "Letter"+HandleConstants.getLetterRif() +"_an-"+doc.getChildText("field")+".xml")); + xo.output( + new Document().setRootElement(new Element("add") + .setContent(tmp_doc)), + new FileWriter(HandleConstants.getWorkDir() + "Letter" + + HandleConstants.getLetterRif() + "_an-" + + doc.getChildText("field") + ".xml")); } catch (IOException e) { e.printStackTrace(); } @@ -243,8 +327,20 @@ public static void makeSentenceXML(Document xmlSentences) { } - public static void makeIntegrationXMLforAnalysis(Document xmlDoc){ + public static void makeIntegrationXMLforAnalysis(Document xmlDoc) { System.out.println("in makeIntegrationXMLforAnalysis"); + Document teiDoc = null; + try { + teiDoc = TextUtils.fileToDocument(HandleConstants.getWorkDir()+HandleConstants.getTeifile()); + + } catch (JDOMException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } catch (IOException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + Element root = xmlDoc.getRootElement(); XMLOutputter xo = new XMLOutputter(Format.getPrettyFormat()); List docs = docsSentences(root); @@ -253,72 +349,140 @@ public static void makeIntegrationXMLforAnalysis(Document xmlDoc){ Integer sentenceOffset = new Integer(0); for (Element doc : docs) { Element sentenceAnalysis = new Element("sentence"); - PopulateSentenceAnalysis(sentenceAnalysis, doc); + String ctsUri = PopulateSentenceAnalysis(sentenceAnalysis, doc); sentecesAnalysis.add(sentenceAnalysis); - //sentence offset is for handling start end in a letter scope instead in a sentence scope - sentenceOffset = PopulateTokens(tokens,doc,sentenceOffset); + // sentence offset is for handling start end in a letter scope + // instead in a sentence scope + String[] ctsParts = ctsUri.split("\\."); + System.err.println("CTSURN PER GESTIONE TOKEN DA TEI **** \n"+ ctsParts[ctsParts.length-1]); + + Namespace ns = teiDoc.getRootElement().getNamespace(); + String pathToSentence = HandleConstants.getXpathForSentences()+"[@n='"+ctsParts[ctsParts.length-1]+"']"; + + XPathExpression sentencesExpression = XPathFactory + .instance().compile(pathToSentence, Filters.element(), null, + Namespace.getNamespace("tei", ns.getURI())); + List listOfSentences = sentencesExpression + .evaluate(teiDoc); + + if (null != listOfSentences) { + System.err.println("number of sentences: " + + listOfSentences.size()); + sentenceOffset = PopulateTokens(tokens, doc, listOfSentences.get(0), + sentenceOffset); + } } - try { - xo.output(new Document().setRootElement(new Element("linguistical_analysis").setContent(sentecesAnalysis)), new FileWriter(HandleConstants.getWorkDir()+ "Letter"+HandleConstants.getLetterRif() +"_an.xml")); - }catch(IOException e){ + try { + xo.output(new Document().setRootElement(new Element( + "linguistical_analysis").setContent(sentecesAnalysis)), + new FileWriter(HandleConstants.getWorkDir() + "Letter" + + HandleConstants.getLetterRif() + "_an.xml")); + } catch (IOException e) { e.printStackTrace(); } - try{ - xo.output(new Document().setRootElement(new Element("tokens").setAttribute("uri", "").setContent(tokens)), new FileWriter(HandleConstants.getWorkDir()+ "Letter"+HandleConstants.getLetterRif() +"_tokens.xml")); - }catch(IOException e){ + try { + xo.output(new Document().setRootElement(new Element("tokens") + .setAttribute("uri", "").setContent(tokens)), + new FileWriter(HandleConstants.getWorkDir() + "Letter" + + HandleConstants.getLetterRif() + "_tokens.xml")); + } catch (IOException e) { e.printStackTrace(); } } - private static List getWordFromXmldoc(Element docxml){ + private static List getWordFromXmldoc(Element docxml) { return docxml.getChildren().get(8).getChildren(); } - private static String getCtsuri(Element docxml){ + + private static String getCtsuri(Element docxml) { return docxml.getChildren().get(3).getText(); } - private static String handleSentenceOffSet(Integer offset, String base){ + private static String handleSentenceOffSet(Integer offset, String base) { String ret = null; int baseInt = Integer.valueOf(base).intValue(); - ret = String.valueOf(baseInt+offset.intValue()); + ret = String.valueOf(baseInt + offset.intValue()); return ret; } - private static Integer PopulateTokens(List toks,Element doc,Integer Offset){ + private static Integer PopulateTokens(List toks, Element doc, + Element teiSentence, Integer Offset) { List words = getWordFromXmldoc(doc); int localOffset = 0; + for (Element w : words) { toks.add(new Element("token") - .setAttribute("uri", w.getAttributeValue("extended")) - .setAttribute("start",handleSentenceOffSet(Offset,w.getAttributeValue("start"))) - .setAttribute("end", handleSentenceOffSet(Offset,w.getAttributeValue("end"))) - .addContent(w.getAttributeValue("token")) - ); - localOffset = Integer.valueOf(w.getAttributeValue("end")).intValue(); + .setAttribute("uri", w.getAttributeValue("extended")) + .setAttribute( + "start", + handleSentenceOffSet(Offset, + w.getAttributeValue("start"))) + .setAttribute( + "end", + handleSentenceOffSet(Offset, + w.getAttributeValue("end"))) + .setAttribute("abbr", handleAbbreviation(w.getAttributeValue("token"),teiSentence)) + .setAttribute("subtokens", handleLineBreak(w.getAttributeValue("token"),teiSentence)) + .addContent(w.getAttributeValue("token"))); + localOffset = Integer.valueOf(w.getAttributeValue("end")) + .intValue(); } - Offset = Integer.valueOf(Offset.intValue()+localOffset+1); - System.out.println("Offset: "+Offset +" localOffset: "+localOffset); + Offset = Integer.valueOf(Offset.intValue() + localOffset + 1); + System.out + .println("Offset: " + Offset + " localOffset: " + localOffset); return Offset; } - private static void PopulateSentenceAnalysis(Element newAnalysis, Element oldAnalysis){ + private static String handleLineBreak(String tokenValue, + Element teiSentence) { + // TODO Auto-generated method stub + return teiSentence.getAttributeValue("n"); + } + + private static String handleAbbreviation(String tokenValue, + Element teiSentence) { + // TODO Auto-generated method stub + Namespace ns = teiSentence.getDocument().getRootElement().getNamespace(); + String pathToChoice = "tei:choice"; + Element choice = null; + String ret = ""; + + XPathExpression choicesExpression = XPathFactory + .instance().compile(pathToChoice, Filters.element(), null, + Namespace.getNamespace("tei", ns.getURI())); + List listOfChoice = choicesExpression + .evaluate(teiSentence); + if(!listOfChoice.isEmpty() && null!=listOfChoice){ + choice = listOfChoice.get(0); // FIXME: se nella sentence ci sono più di una choice non funziona questo metodo + System.err.println(tokenValue); + System.err.println(choice.getChild("expan", ns).getText()); + if(tokenValue.equals(choice.getChild("expan", ns).getText())) + ret = choice.getChild("abbr", ns).getText(); + } + + return ret; + + } + + private static String PopulateSentenceAnalysis(Element newAnalysis, + Element oldAnalysis) { List words = getWordFromXmldoc(oldAnalysis); String ctsUri = getCtsuri(oldAnalysis); - //System.out.println(ctsUri); + // System.out.println(ctsUri); newAnalysis.setAttribute("uri", ctsUri); for (Element w : words) { - //System.out.println(w.getAttributeValue("form")); + // System.out.println(w.getAttributeValue("form")); newAnalysis.addContent(new Element("token") - .setAttribute("uri", w.getAttributeValue("extended")) - .setAttribute("prog",w.getAttributeValue("prog")) - .setAttribute("start",w.getAttributeValue("start")) - .setAttribute("end",w.getAttributeValue("end")) - .setAttribute("form",w.getAttributeValue("form")) - .setAttribute("morphoCode",w.getAttributeValue("pos")) - .setAttribute("lemma",w.getAttributeValue("lemma")) - ); + .setAttribute("uri", w.getAttributeValue("extended")) + .setAttribute("prog", w.getAttributeValue("prog")) + .setAttribute("start", w.getAttributeValue("start")) + .setAttribute("end", w.getAttributeValue("end")) + .setAttribute("form", w.getAttributeValue("form")) + .setAttribute("morphoCode", w.getAttributeValue("pos")) + .setAttribute("lemma", w.getAttributeValue("lemma"))); } + return ctsUri; } }