root/trunk/lib/parser.py

Revision 28, 22.8 kB (checked in by mjoc, 6 months ago)

Switched to built-in XML library, external XMLLib is not being used
anymore. Minor version number increased to 0.6.4.

Line 
1 #
2 # OpenDict
3 # Copyright (c) 2003-2006 Martynas Jocius <martynas.jocius@idiles.com>
4 # Copyright (c) 2007 IDILES SYSTEMS, UAB <support@idiles.com>
5 #
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 # 02111-1307 USA
20 #
21
22 import time
23 import string
24 import re
25 import os
26 import traceback
27 import xml.parsers.expat
28
29 from lib.extra import dictclient
30 from lib.extra import dictdlib
31 from lib import info
32 from lib import misc
33 from lib import errortype
34 from lib import meta
35 from lib import plaindict
36 from lib.logger import systemLog, debugLog, DEBUG, INFO, WARNING, ERROR
37
38
39 WORD_BG = "#dde2f1" # Bright blue; background of the headword cell in result tables
40 DICT_BG = "#b4bedb" # Darker blue; background of the database description row (DictConnection)
41
class SlowoParser(plaindict.PlainDictionary):
    """Built-in parser for dictionary files in Slowo format.

    As parsed by search(), every line of the file looks like::

        word = translation // comment ; translation // comment ; ...

    The file is opened by start(), scanned by search() and closed by
    stop().
    """

    def __init__(self, filePath):
        """Remember the dictionary path; the file is not opened here.

        filePath -- path to the Slowo dictionary file.
        """

        self.filePath = filePath
        self.needsList = True

        # Default name: file name without directory and extension
        self.name = os.path.splitext(os.path.basename(filePath))[0]

        # Additional information, filled in later through the setters
        self.encoding = None
        self.checksum = None
        self.index = None

        # File handle, opened by start()
        self.fd = None

        self.configChanged = False

    def start(self):
        """Open file handle"""

        debugLog(DEBUG, "Opening file %s" % self.filePath)
        self.fd = open(self.filePath)

    def stop(self):
        """Close file handle (best effort; I/O errors are ignored)"""

        fd = getattr(self, 'fd', None)
        if fd is None:
            # start() was never called, nothing to close
            return

        debugLog(DEBUG, "Closing file %s" % self.filePath)
        try:
            fd.close()
        except (IOError, OSError):
            # Closing is best effort only
            pass

    def setIndex(self, index):
        """Set index table.

        search() looks up the first two characters of the lowered
        search word in this table and seeks to the stored position.
        """

        self.index = index

    def getPath(self):
        """Return full file path"""

        return self.filePath

    def setChecksum(self, newSum, first=False):
        """Set checksum. Used after checksum change.

        newSum -- the new checksum value.
        first  -- accepted for interface compatibility; not used.

        The configuration is marked as changed only when no checksum
        was known before.
        """

        if self.checksum is None:
            self.configChanged = True

        self.checksum = newSum

    def getChecksum(self):
        """Return checksum"""

        return self.checksum

    def getType(self):
        """Return dictionary type"""

        # Imported locally, as in the original code
        from lib import dicttype
        return dicttype.SLOWO

    def setName(self, name):
        """Set new name"""

        self.name = name

    def getName(self):
        """Return dictionary name (by default derived from the file name)"""

        return self.name

    def setEncoding(self, encoding):
        """Set encoding and mark the configuration as changed"""

        self.encoding = encoding
        self.configChanged = True

    def getEncoding(self):
        """Return encoding set for that dictionary"""

        return self.encoding

    def getUsesWordList(self):
        """Return True if uses word list, False otherwise"""

        return self.needsList

    def _appendTranslation(self, html, orig, trans):
        """Append an HTML table with word `orig` and body `trans` to
        the list `html`"""

        html.append("<table width=\"100%\"><tr>")
        html.append("<td bgcolor=\"%s\">" % WORD_BG)
        html.append("<b>%s</b></td></tr>" % orig)
        html.append("<tr><td>")
        html.append("<p>%s</p>" % trans)
        html.append("</td></tr></table>")

    def _formatTranslation(self, text):
        """Convert the right-hand side of a Slowo line to an HTML list.

        `text` is split on ';' into translations; inside each
        translation everything after the first '//' is treated as a
        comment and rendered in italics. Empty translations are skipped.
        """

        items = ["<ul>"]
        for chunk in text.split(';'):
            parts = chunk.split('//')
            trans = parts[0].strip()
            comment = "".join(parts[1:]).strip()

            if trans and comment:
                items.append("<li>%s (<i>%s</i>)</li>" % (trans, comment))
            elif trans:
                items.append("<li>%s</li>" % trans)
        items.append("</ul>")

        return "".join(items)

    def search(self, word):
        """Lookup word.

        Returns a meta.SearchResult holding an HTML page and the list
        of all words starting with `word` (case-insensitive). The page
        shows the exact match or, if there is none, the first word that
        starts with `word`. NOT_FOUND is set when nothing matches.
        """

        _start = time.time()

        word_lowered = word.lower()
        prefix = word_lowered[:2]

        # Index keys are encoded to the dictionary encoding so that they
        # can be compared with the prefix of the search word. A missing
        # index simply means scanning from the beginning of the file.
        encoding = self.getEncoding()
        encodedIndex = {}
        for literal in (self.index or {}):
            encodedIndex[literal.encode(encoding)] = self.index.get(literal)

        #
        # Seek to the beginning of the block
        #
        position = encodedIndex.get(prefix, 0)

        debugLog(DEBUG, "Index: %s->%d" % (prefix, position))
        debugLog(DEBUG, "SlowoParser: Seeking to %d" % position)

        self.fd.seek(position)

        html = []

        html.append("<html><head>")
        html.append("<meta http-equiv=\"Content-Type\" " \
                    "content=\"text/html; charset=%s\">" \
                    % str(encoding))
        html.append("</head><body>")

        found = False
        words = []

        # First matching entry, shown when there is no exact match
        suggestedWord = None
        suggestedTrans = None

        result = meta.SearchResult()

        # DEBUG
        _linesRead = 0

        for line in self.fd:
            _linesRead += 1
            line = line.strip()

            try:
                orig, end = line.split('=', 1)
            except ValueError as e:
                # Line without '=': log it and treat it as an empty entry
                systemLog(ERROR, '%s (line %s)' % (e, line))
                orig, end = "", ""
            orig = orig.strip()

            if line.lower().startswith(word_lowered):

                if not orig.lower().startswith(word_lowered):
                    break

                # Format the translation only for matching lines
                translation = self._formatTranslation(end)

                if orig.lower() == word_lowered and not found:
                    found = True
                    self._appendTranslation(html, orig, translation)

                words.append(orig)
                if len(words) == 1:
                    suggestedWord = orig
                    suggestedTrans = translation
            elif words:
                # The block of matching words has ended
                break

        debugLog(DEBUG, "%d lines scanned" % _linesRead)

        if not found:
            if words:
                self._appendTranslation(html, suggestedWord, suggestedTrans)
            else:
                result.setError(errortype.NOT_FOUND)

        html.append("</body></html>")

        try:
            translation = "".join(html)
        except UnicodeError:
            # Constant name spelled as this module has always referenced it
            result.setError(errortype.INVALID_ENCOFING)
            translation = ""

        result.setTranslation(translation)
        result.setWordList(words)

        debugLog(DEBUG, "SlowoParser: search took %f seconds" \
                 % (time.time() - _start))

        return result
277
278
279
class MovaParser(plaindict.PlainDictionary):
    """Built-in parser for dictionary files in 'Mova' format.

    As parsed by search(), every line of the file holds a word and its
    translation separated by two spaces. The file is opened by start(),
    scanned by search() and closed by stop().
    """

    def __init__(self, filePath):
        """Remember the dictionary path; the file is not opened here.

        filePath -- path to the Mova dictionary file.
        """

        self.filePath = filePath
        self.needsList = True

        # Default name: file name without directory and extension
        self.name = os.path.splitext(os.path.basename(filePath))[0]

        # Additional variables, filled in later through the setters
        self.encoding = None
        self.checksum = None
        self.index = None

        # File handle, opened by start()
        self.fd = None

        # If this is True when closing, the new configuration will be
        # written to disk
        self.configChanged = False

    def start(self):
        """Open file handle"""

        debugLog(DEBUG, "Opening file %s" % self.filePath)
        self.fd = open(self.filePath)

    def stop(self):
        """Close file handle (best effort; I/O errors are ignored)"""

        fd = getattr(self, 'fd', None)
        if fd is None:
            # start() was never called, nothing to close
            return

        debugLog(DEBUG, "Closing file %s" % self.filePath)
        try:
            fd.close()
        except (IOError, OSError):
            # Closing is best effort only
            pass

    def setIndex(self, index):
        """Set index table.

        search() looks up the first two characters of the lowered
        search word in this table and seeks to the stored position.
        """

        self.index = index

    def getPath(self):
        """Return full file path"""

        return self.filePath

    def setChecksum(self, newSum, first=False):
        """Set checksum. Used after checksum change.

        newSum -- the new checksum value.
        first  -- accepted for interface compatibility; not used.

        The configuration is marked as changed only when no checksum
        was known before.
        """

        if self.checksum is None:
            self.configChanged = True

        self.checksum = newSum

    def getChecksum(self):
        """Return checksum"""

        return self.checksum

    def getType(self):
        """Return dictionary type"""

        # Imported locally, as in the original code
        from lib import dicttype
        return dicttype.MOVA

    def setName(self, name):
        """Set new name"""

        self.name = name

    def getName(self):
        """Return dictionary name (by default derived from the file name)"""

        return self.name

    def setEncoding(self, encoding):
        """Set encoding and mark the configuration as changed"""

        self.encoding = encoding
        self.configChanged = True

    def getEncoding(self):
        """Return encoding set for this dictionary"""

        return self.encoding

    def getUsesWordList(self):
        """Return True if uses word list, False otherwise"""

        return self.needsList

    def _appendTranslation(self, html, orig, trans):
        """Append an HTML table with word `orig` and body `trans` to
        the list `html`"""

        html.append("<table width=\"100%\"><tr>")
        html.append("<td bgcolor=\"%s\">" % WORD_BG)
        html.append("<b>%s</b></td></tr>" % orig)
        html.append("<tr><td>")
        html.append("<p>%s</p>" % trans)
        html.append("</td></tr></table>")

    def search(self, word):
        """Lookup word.

        Returns a meta.SearchResult holding an HTML page and the list
        of all words starting with `word` (case-insensitive). The page
        shows the exact match or, if there is none, the first word that
        starts with `word`. NOT_FOUND is set when nothing matches.
        """

        _start = time.time()

        word_lowered = word.lower()
        prefix = word_lowered[:2]

        # Index keys are encoded to the dictionary encoding so that they
        # can be compared with the prefix of the search word. A missing
        # index simply means scanning from the beginning of the file.
        encoding = self.getEncoding()
        encodedIndex = {}
        for literal in (self.index or {}):
            encodedIndex[literal.encode(encoding)] = self.index.get(literal)

        #
        # Seek to the beginning of the block
        #
        position = encodedIndex.get(prefix, 0)

        debugLog(DEBUG, "Index: %s->%d" % (prefix, position))
        debugLog(DEBUG, "MovaParser: Seeking to %d" % position)
        self.fd.seek(position)

        html = []

        html.append("<html><head>")
        html.append("<meta http-equiv=\"Content-Type\" " \
                    "content=\"text/html; charset=%s\">" \
                    % str(encoding))
        html.append("</head><body>")

        found = False
        words = []

        # First matching entry, shown when there is no exact match
        suggestedWord = None
        suggestedTrans = None

        result = meta.SearchResult()

        # DEBUG
        _linesRead = 0

        for line in self.fd:
            _linesRead += 1
            line = line.strip()

            try:
                orig, trans = line.split("  ", 1)
            except ValueError:
                # No two-space separator on this line: skip it
                continue

            if line.lower().startswith(word_lowered):

                if not orig.lower().startswith(word_lowered):
                    break

                if orig.lower() == word_lowered and not found:
                    found = True
                    self._appendTranslation(html, orig, trans)

                words.append(orig)
                if len(words) == 1:
                    suggestedWord = orig
                    suggestedTrans = trans
            elif words:
                # The block of matching words has ended
                break

        debugLog(DEBUG, "%d lines scanned" % _linesRead)

        if not found:
            if words:
                self._appendTranslation(html, suggestedWord, suggestedTrans)
            else:
                result.setError(errortype.NOT_FOUND)

        html.append("</body></html>")

        try:
            translation = "".join(html)
        except UnicodeError:
            # Constant name spelled as this module has always referenced it
            result.setError(errortype.INVALID_ENCOFING)
            translation = ""

        result.setTranslation(translation)
        result.setWordList(words)

        debugLog(DEBUG, "MovaParser: Search took %f seconds" \
                 % (time.time() - _start))

        return result
491
492
493
494 # FIXME: Deprecated
class TMXParser(plaindict.PlainDictionary):
    """Built-in TMX parser (deprecated).

    Reads a TMX file with expat in start() and keeps a mapping from
    source-language segments to lists of translated segments, which
    search() then queries.
    """

    # Header attributes copied from the <header> element
    _HEADER_KEYS = ("srclang", "creationtool", "creationtoolversion",
                    "o-tmf", "adminlang", "datatype", "segtype")

    def __init__(self, filePath):
        """Remember the file path and reset the parsing state.

        filePath -- path to the TMX file; it is read by start().
        """

        systemLog(WARNING, "***")
        systemLog(WARNING, "*** WARNING:")
        systemLog(WARNING, "*** TMX implementation is fuzzy and should " \
                  "not be used yet!")
        systemLog(WARNING, "***")

        self.filePath = filePath
        self.name = os.path.splitext(os.path.basename(filePath))[0]
        self.needsList = True
        self.encoding = None

        self.mapping = {}    # source segment -> list of translations
        self.header = {}     # attributes of the <header> element
        self.trans = []      # translations of the current <tu>
        self.orig = None     # source segment of the current <tu>
        self.inTu = 0
        self.inTuv = 0
        self.inSeg = 0
        self.lang = ""       # language of the current <tuv>
        self._segChunks = [] # text pieces of the current <seg>

    def start(self):
        """Parse the TMX file and fill the word mapping"""

        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = self.startElement
        parser.EndElementHandler = self.endElement
        parser.CharacterDataHandler = self.charData

        if self.filePath != "":
            # Binary mode: expat handles the XML encoding declaration
            fd = open(self.filePath, "rb")
            try:
                parser.Parse(fd.read(), 1)
            finally:
                fd.close()

    def getType(self):
        """Return dictionary type"""

        # Imported locally, as the other parsers in this module do
        from lib import dicttype
        return dicttype.TMX

    def setName(self, name):
        """Set new name"""

        self.name = name

    def getName(self):
        """Return dictionary name (by default derived from the file name)"""

        return self.name

    def setEncoding(self, encoding):
        """Set encoding"""

        self.encoding = encoding

    def getEncoding(self):
        """Return encoding set for that dictionary.

        The original code read the encoding from the wx application
        object, but `wx` is not imported in this module, so the value
        stored by setEncoding() is returned instead.
        """

        return self.encoding

    def getUsesWordList(self):
        """Return True if uses word list, False otherwise"""

        return self.needsList

    def startElement(self, name, attrs):
        """expat start-element handler"""

        if name == "tu":
            self.inTu = 1
            # Do not carry the segments of the previous unit over
            self.orig = None
            self.trans = []
        elif name == "tuv":
            self.inTuv = 1
            # Accept both the 'xml:lang' and the plain 'lang' attribute
            self.lang = attrs.get("xml:lang", attrs.get("lang", ""))
        elif name == "seg":
            self.inSeg = 1
            self._segChunks = []
        elif name == "header":
            for key in self._HEADER_KEYS:
                self.header[key] = attrs.get(key, "")

    def endElement(self, name):
        """expat end-element handler"""

        if name == "tu":
            self.inTu = 0
            if self.orig is not None:
                self.mapping.setdefault(self.orig, []).extend(self.trans)
            self.trans = []
        elif name == "tuv":
            self.inTuv = 0
        elif name == "seg":
            self.inSeg = 0
            # The segment is complete only now: expat may deliver its
            # text in several charData() calls
            text = "".join(self._segChunks)
            self._segChunks = []
            if self.lang == self.header.get("srclang"):
                self.orig = text
            else:
                self.trans.append(text)

    def charData(self, data):
        """expat character-data handler; collects text inside <seg>"""

        if self.inSeg:
            self._segChunks.append(data)

    def search(self, word):
        """Lookup word.

        Returns a tuple (html, words, errno): the HTML page for the
        first source segment starting with `word` (case-insensitive),
        the sorted list of all such segments, and 1 if there is no
        match, else 0.
        """

        parts = ["<html><head>" \
                 "<meta http-equiv=\"Content-Type\" " \
                 "content=\"text/html; charset=%s\">" \
                 "</head><body>" % str(self.getEncoding())]

        avail = []
        word_lowered = word.lower()
        indent = "&nbsp;" * 3

        # Sorted, so that the result does not depend on dict ordering
        for key in sorted(self.mapping):
            if not key.lower().startswith(word_lowered):
                continue

            if not avail:
                # Only the first match is rendered
                parts.append("<u><b>%s</b></u><br>" % key)
                parts.append("<table><tr><td>")
                parts.append(indent + ("<br>" + indent).join(self.mapping[key]))
                parts.append("</td></tr></table>")
            avail.append(key)

        parts.append("</body></html>")

        if avail:
            errno = 0
        else:
            errno = 1

        return ("".join(parts), avail, errno)

    def makeHashTable(self):
        """Do nothing; kept as part of the class interface"""

        pass
650          
651
652
class DictParser(plaindict.PlainDictionary):
    """Built-in dictd dictionaries parser.

    Opens a dictd dictionary through dictdlib.DictDB in start() and
    looks words up in it.
    """

    def __init__(self, filePath):
        """Remember the file path and derive the default name from it.

        filePath -- path to one of the dictionary files; two extensions
                    are stripped from its base name to get the name.
        """

        self.filePath = filePath
        self.needsList = True
        self.name = self._baseName(filePath)
        self.encoding = 'UTF-8'
        self.checksum = None

        self.configChanged = False

        self.dict = None          # dictdlib.DictDB, created by start()
        self.definitions = None   # sorted headword list, loaded lazily

    def _baseName(self, filePath):
        """Return the file name without directory and without its last
        two extensions"""

        fileName = os.path.basename(filePath)
        return os.path.splitext(os.path.splitext(fileName)[0])[0]

    def start(self):
        """Allocate resources"""

        indexFile = os.path.join(os.path.dirname(self.filePath),
                                 self._baseName(self.filePath))
        self.dict = dictdlib.DictDB(indexFile)

    def stop(self):
        """Free resources"""

        # Drop the reference but keep the attribute, so that a repeated
        # stop() does not fail with AttributeError
        self.dict = None

    def getPath(self):
        """Return full file path"""

        return self.filePath

    def getType(self):
        """Return dictionary type"""

        # Imported locally, as in the original code
        from lib import dicttype
        return dicttype.DICT

    def setName(self, name):
        """Set new name"""

        self.name = name

    def getName(self):
        """Return dictionary name (by default derived from the file name)"""

        return self.name

    def setEncoding(self, encoding):
        """Set encoding"""

        self.encoding = encoding

    def getEncoding(self):
        """Return encoding set for that dictionary"""

        return self.encoding

    def setChecksum(self, newSum, first=False):
        """Set checksum. Used after checksum change.

        newSum -- the new checksum value.
        first  -- accepted for consistency with the other parsers in
                  this module; not used.

        The configuration is marked as changed only when no checksum
        was known before.
        """

        if self.checksum is None:
            self.configChanged = True

        self.checksum = newSum

    def getUsesWordList(self):
        """Return True if uses word list, False otherwise"""

        return self.needsList

    def _getTranslation(self, word):
        """Return word and translation code without formatting
        full HTML document.

        Returns a tuple (orig, translation) of HTML fragments, or
        (None, None) if the dictionary has no definition for `word`.
        When several definitions are returned, each one overwrites the
        previous, so the last one is the result.
        """

        translations = self.dict.getdef(word)

        orig = None
        translation = None

        for source in translations:
            chunks = [chunk.strip() for chunk in source.split('\n')]

            # The first line is the headword, optionally followed by a
            # pronunciation in square brackets
            orig = chunks[0]
            pron = re.findall(r"\[(.*?)\]", orig)
            if pron:
                orig = "<b>%s</b> [<i>%s</i>]" % \
                       (orig.replace(" [%s]" % pron[0], ""), pron[0])
            else:
                orig = "<b>%s</b>" % orig

            items = ['<ul>']
            for chunk in chunks[1:]:
                if chunk:
                    items.append("<li>%s</li>" % chunk)
            items.append('</ul>')

            translation = "".join(items)

            # Turn {word} cross-references into links
            for link in re.findall(r"\{(.*?)\}", translation):
                translation = translation.replace("{%s}" % link,
                                                  "<a href=\"%s\">%s</a>" \
                                                  % (link, link))

        return (orig, translation)

    def search(self, word):
        """Lookup word.

        Returns a meta.SearchResult holding an HTML page and the list
        of all headwords starting with `word` (case-insensitive). If
        `word` itself has no definition, the first headword from that
        list is shown instead; NOT_FOUND is set when that fails too.
        """

        _start = time.time()

        result = meta.SearchResult()

        word_lowered = word.lower()

        if self.definitions is None:
            self.definitions = self.dict.getdeflist()
            self.definitions.sort()

        words = [definition for definition in self.definitions
                 if definition.lower().startswith(word_lowered)]

        html = []

        html.append("<html><head>")
        html.append("<meta http-equiv=\"Content-Type\" " \
                    "content=\"text/html; charset=%s\">" \
                    % str(self.getEncoding()))
        html.append("</head><body>")

        (orig, translation) = self._getTranslation(word)

        if not translation and words:
            debugLog(DEBUG, "Retrying search...")
            orig, translation = self._getTranslation(words[0])

        if not translation:
            result.setError(errortype.NOT_FOUND)
            translation = ""

        if orig is None:
            # Do not print the text "None" into the page
            orig = ""

        html.append("<table width=\"100%\"><tr>")
        html.append("<td bgcolor=\"%s\">" % WORD_BG)
        html.append("<b>%s</b></td></tr>" % orig)
        html.append("<tr><td>")
        html.append("<p>%s</p>" % translation)
        html.append("</td></tr></table>")
        html.append("</body></html>")

        result.setTranslation("".join(html))
        result.setWordList(words)

        debugLog(DEBUG, "DictParser: Search took % f seconds" \
                 % (time.time() - _start))

        return result
835
836
837 # TODO:
838 # 1. This is not a parser, move to another module
839 # 2. Add needed methods
840 #
class DictConnection(meta.Dictionary):
    """Built-in DICT client.

    Connects to a DICT server and does the search.
    """

    def __init__(self, server, port, db, strategy):
        """Store the connection parameters; nothing is connected yet.

        server   -- host passed to dictclient.Connection
        port     -- port passed to dictclient.Connection
        db       -- database passed to the server's define command
        strategy -- stored only; not used by search()
        """

        self.server = server
        self.port = port
        self.db = db
        self.strategy = strategy
        self.encoding = "UTF-8"
        self.needsList = 0
        self.name = 'Connection to DICT server'

    def getUsesWordList(self):
        """Return True if uses word list, False otherwise"""

        return self.needsList

    def setName(self, name):
        """Set new name"""

        self.name = name

    def getName(self):
        """Return name"""

        return self.name

    def setEncoding(self, encoding):
        """Set encoding"""

        self.encoding = encoding

    def getEncoding(self):
        """Return encoding"""

        return self.encoding

    def _formatHeadword(self, orig):
        """Return HTML for the headword line `orig`.

        A pronunciation written as [..], /../ or \\..\\ (tried in that
        order) is taken out of the line and shown in italics.
        """

        patterns = ((r"\[(.*?)\]", " [%s]"),     # 1st comment type
                    (r"/(.*?)/", " /%s/"),       # 2nd comment type
                    (r"\\(.*?)\\", " \\%s\\"))   # 3rd comment type

        for regex, pronPatt in patterns:
            pron = re.findall(regex, orig)
            if pron:
                return "<b>%s</b> [<i>%s</i>]" % \
                       (orig.replace(pronPatt % pron[0], ""), pron[0])

        return "<b>%s</b>" % orig

    def search(self, word):
        """Lookup word.

        Returns a meta.SearchResult. CONNECTION_ERROR is set if the
        connection cannot be created, NOT_FOUND if the server returns
        no definition (a failing define request counts as no
        definition). Otherwise the translation is an HTML page with one
        table per returned definition.
        """

        result = meta.SearchResult()

        try:
            conn = dictclient.Connection(self.server, self.port)
        except Exception:
            result.setError(errortype.CONNECTION_ERROR)
            return result

        html = []
        html.append("<html><head>" \
                    "<meta http-equiv=\"Content-Type\" " \
                    "content=\"text/html; charset=%s\">" \
                    "</head><body>" % self.getEncoding())

        try:
            data = conn.define(self.db, word)
        except Exception:
            data = []

        found = False

        for d in data:
            found = True

            html.append("<p><table width=\"100%\"><tr>")
            html.append("<td bgcolor=\"%s\">" % DICT_BG)
            html.append("<b><i>%s</i></b></td></tr>" \
                        % d.getdb().getdescription())

            # The definition comes from the server: escape it before
            # putting it into HTML ('&' first, so that the entities
            # produced below are not escaped again)
            source = d.getdefstr()
            source = source.replace('&', '&amp;')
            source = source.replace('<', '&lt;')
            source = source.replace('>', '&gt;')

            # The first line of the definition is the headword
            orig = self._formatHeadword(source.split("\n", 1)[0])

            html.append("<tr><td bgcolor=\"%s\">" % WORD_BG)
            html.append("%s</td></tr>" % orig)

            # Blank lines become paragraph breaks, single newlines spaces
            source = source.replace('\n\n', '<br><br>')
            translation = ' '.join(source.split('\n'))

            # Turn {word} cross-references into links
            for link in re.findall(r"\{(.*?)\}", translation):
                translation = translation.replace("{%s}" % link,
                                  "<a href=\"%s\">%s</a>" % (link, link))
            html.append("<tr><td>%s</td></tr>" % translation)
            html.append("</table></p>")

        html.append("</body></html>")

        result.setTranslation(''.join(html))

        if not found:
            result.setError(errortype.NOT_FOUND)

        return result
Note: See TracBrowser for help on using the browser.