root/tags/release-0.6.2/lib/parser.py

Revision 1, 22.8 kB (checked in by mjoc, 2 years ago)

Initial import.

Line 
1 #
2 # OpenDict
3 # Copyright (c) 2003-2005 Martynas Jocius <mjoc@akl.lt>
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 # 02111-1307 USA
19 #
20
21 import time
22 import string
23 import re
24 import os
25 import traceback
26 import xml.parsers.expat
27 from wxPython.wx import wxGetApp
28
29 from lib.extra import dictclient
30 from lib.extra import dictdlib
31 from lib import info
32 from lib import misc
33 from lib import errortype
34 from lib import meta
35 from lib import plaindict
36 from lib.logger import systemLog, debugLog, DEBUG, INFO, WARNING, ERROR
37
38
# Background color of the table cell that shows the looked-up word
WORD_BG = "#dde2f1" # Bright blue
# Background color of the table cell that shows the dictionary
# description in DictConnection results
DICT_BG = "#b4bedb"
41
class SlowoParser(plaindict.PlainDictionary):
   """
   Built-in Slowo Parser

   Parses file in Slowo format.

   search() reads every line as "word = translations". Translations
   are separated by ';' and each of them may carry a comment after '//'.
   """

   def __init__(self, filePath):
      """Initialize parser for the file filePath.

      The file is not opened here, see start().
      """

      self.filePath = filePath
      self.needsList = True

      # Default name is the file name without directory and extension
      self.name = os.path.splitext(os.path.basename(filePath))[0]

      # Additional information
      self.encoding = None
      self.checksum = None
      self.index = None

      # File handle, opened by start()
      self.fd = None

      self.configChanged = False


   def start(self):
      """Open file handle"""

      debugLog(DEBUG, "Opening file %s" % self.filePath)
      self.fd = open(self.filePath)


   def stop(self):
      """Close file handle. Does nothing if the file was never opened"""

      debugLog(DEBUG, "Closing file %s" % self.filePath)

      fd = getattr(self, "fd", None)
      if fd is None:
         return

      try:
         fd.close()
      except (IOError, OSError):
         # Closing is best effort, a failure here is not fatal
         pass


   def setIndex(self, index):
      """Set index table"""

      self.index = index


   def getPath(self):
      """Return full file path"""

      return self.filePath


   def setChecksum(self, newSum, first=False):
      """Set checksum. Used after checksum change.

      The configuration is marked as changed only when no checksum
      was set before. The 'first' argument is accepted but not used.
      """

      if self.checksum is None:
         self.configChanged = True

      self.checksum = newSum


   def getChecksum(self):
      """Return checksum"""

      return self.checksum


   def getType(self):
      """Return dictionary type"""

      from lib import dicttype
      return dicttype.SLOWO


   def setName(self, name):
      """Set new name"""

      self.name = name


   def getName(self):
      """Return file name"""

      return self.name


   def setEncoding(self, encoding):
      """Set encoding and mark the configuration as changed"""

      self.encoding = encoding
      self.configChanged = True


   def getEncoding(self):
      """Return encoding set for that dictionary"""

      return self.encoding


   def getUsesWordList(self):
      """Return True if uses word list, False otherwise"""

      return self.needsList


   def _appendTranslation(self, html, orig, trans):
      """Appends HTML strings to list"""

      html.append("<table width=\"100%\"><tr>")
      html.append("<td bgcolor=\"%s\">" % WORD_BG)
      html.append("<b>%s</b></td></tr>" % orig)
      html.append("<tr><td>")
      html.append("<p>%s</p>" % trans)
      html.append("</td></tr></table>")


   def _findBlockPosition(self, prefix):
      """Return file offset stored in the index for prefix.

      Index literals are encoded with the dictionary encoding before
      the comparison. Returns 0 (beginning of the file) if there is no
      index or the prefix is not in it.
      """

      encoding = self.getEncoding()

      for literal in (self.index or ()):
         if literal.encode(encoding) == prefix:
            return self.index.get(literal)

      return 0


   def _formatTranslation(self, end):
      """Return HTML list made of the right-hand side of a Slowo line.

      'end' is split on ';' into translations; text after '//' in a
      translation is rendered as a comment in italics. Empty
      translations are skipped.
      """

      items = ["<ul>"]

      for chunk in end.split(';'):
         parts = chunk.split('//')
         trans = parts[0].strip()
         comment = "".join(parts[1:]).strip()

         if not trans:
            continue

         if comment:
            items.append("<li>%s (<i>%s</i>)</li>" % (trans, comment))
         else:
            items.append("<li>%s</li>" % trans)

      items.append("</ul>")

      return "".join(items)


   def search(self, word):
      """Lookup word.

      Seeks to the index block of the first two characters of the
      lowered word and scans lines from there. Returns meta.SearchResult
      holding the HTML translation and the list of words that start
      with the searched word. If there is no exact match, the first
      word of that list is shown instead; if the list is empty, the
      NOT_FOUND error is set.
      """

      _start = time.time()

      word_lowered = word.lower()

      #
      # Seek to the beginning of the block
      #
      position = self._findBlockPosition(word_lowered[:2])

      debugLog(DEBUG, "Index: %s->%d" % (word_lowered[:2], position))
      debugLog(DEBUG, "SlowoParser: Seeking to %d" % position)

      self.fd.seek(position)

      html = []

      html.append("<html><head>")
      html.append("<meta http-equiv=\"Content-Type\" " \
                  "content=\"text/html; charset=%s\">" \
                  % str(self.getEncoding()))
      html.append("</head><body>")

      found = False
      words = []

      # Translation of the first matching word, shown when there is
      # no exact match
      suggestedTrans = None

      result = meta.SearchResult()

      # DEBUG
      _linesRead = 0

      for line in self.fd:
         _linesRead += 1
         line = line.strip()

         if '=' not in line:
            # Skip the malformed line; processing it would reuse the
            # word and translation of the previous line
            systemLog(ERROR, "Missing '=' separator (line %s)" % line)
            continue

         orig, end = line.split('=', 1)
         orig = orig.strip()
         translation = self._formatTranslation(end)

         if line.lower().startswith(word_lowered):

            if not orig.lower().startswith(word_lowered):
               break

            if orig.lower() == word_lowered and not found:
               found = True
               self._appendTranslation(html, orig, translation)

            words.append(orig)
            if len(words) == 1:
               suggestedTrans = translation
         elif words:
            # Matching lines are expected to be adjacent, so the first
            # non-matching line after a match ends the scan
            break

      debugLog(DEBUG, "%d lines scanned" % _linesRead)

      if not found:
         if words:
            self._appendTranslation(html, words[0], suggestedTrans)
         else:
            result.setError(errortype.NOT_FOUND)

      html.append("</body></html>")

      try:
         translation = "".join(html)
      except UnicodeError:
         # Mixing incompatible string types can fail on join
         result.setError(errortype.INVALID_ENCOFING)
         translation = ""

      result.setTranslation(translation)
      result.setWordList(words)

      debugLog(DEBUG, "SlowoParser: search took %f seconds" \
            % (time.time() - _start))

      return result
275
276
277
class MovaParser(plaindict.PlainDictionary):
   """
   Built-in Mova Parser

   Parses file in 'Mova' dictionary format and does
   the search.

   search() reads every line as a word and its translation separated
   by two spaces.
   """

   def __init__(self, filePath):
      """Initialize parser for the file filePath.

      The file is not opened here, see start().
      """

      self.filePath = filePath
      self.needsList = True

      # Default name is the file name without directory and extension
      self.name = os.path.splitext(os.path.basename(filePath))[0]

      # Additional variables
      self.encoding = None
      self.checksum = None
      self.index = None

      # File handle, opened by start()
      self.fd = None

      # If this is True when closing, the new configuration will be
      # written to disk
      self.configChanged = False


   def start(self):
      """Open file handle"""

      debugLog(DEBUG, "Opening file %s" % self.filePath)
      self.fd = open(self.filePath)


   def stop(self):
      """Close file handle. Does nothing if the file was never opened"""

      debugLog(DEBUG, "Closing file %s" % self.filePath)

      fd = getattr(self, "fd", None)
      if fd is None:
         return

      try:
         fd.close()
      except (IOError, OSError):
         # Closing is best effort, a failure here is not fatal
         pass


   def setIndex(self, index):
      """Set index table"""

      self.index = index


   def getPath(self):
      """Return full file path"""

      return self.filePath


   def setChecksum(self, newSum, first=False):
      """Set checksum. Used after checksum change.

      The configuration is marked as changed only when no checksum
      was set before. The 'first' argument is accepted but not used.
      """

      if self.checksum is None:
         self.configChanged = True

      self.checksum = newSum


   def getChecksum(self):
      """Return checksum"""

      return self.checksum


   def getType(self):
      """Return dictionary type"""

      from lib import dicttype
      return dicttype.MOVA


   def setName(self, name):
      """Set new name"""

      self.name = name


   def getName(self):
      """Return file name"""

      return self.name


   def setEncoding(self, encoding):
      """Set encoding and mark the configuration as changed"""

      self.encoding = encoding
      self.configChanged = True


   def getEncoding(self):
      """Return encoding set for this dictionary"""

      return self.encoding


   def getUsesWordList(self):
      """Return True if uses word list, False otherwise"""

      return self.needsList


   def _appendTranslation(self, html, orig, trans):
      """Appends HTML strings to list"""

      html.append("<table width=\"100%\"><tr>")
      html.append("<td bgcolor=\"%s\">" % WORD_BG)
      html.append("<b>%s</b></td></tr>" % orig)
      html.append("<tr><td>")
      html.append("<p>%s</p>" % trans)
      html.append("</td></tr></table>")


   def _findBlockPosition(self, prefix):
      """Return file offset stored in the index for prefix.

      Index literals are encoded with the dictionary encoding before
      the comparison. Returns 0 (beginning of the file) if there is no
      index or the prefix is not in it.
      """

      encoding = self.getEncoding()

      for literal in (self.index or ()):
         if literal.encode(encoding) == prefix:
            return self.index.get(literal)

      return 0


   def search(self, word):
      """Lookup word.

      Seeks to the index block of the first two characters of the
      lowered word and scans lines from there. Returns meta.SearchResult
      holding the HTML translation and the list of words that start
      with the searched word. If there is no exact match, the first
      word of that list is shown instead; if the list is empty, the
      NOT_FOUND error is set.
      """

      _start = time.time()

      word_lowered = word.lower()

      #
      # Seek to the beginning of the block
      #
      position = self._findBlockPosition(word_lowered[:2])

      debugLog(DEBUG, "Index: %s->%d" % (word_lowered[:2], position))
      debugLog(DEBUG, "MovaParser: Seeking to %d" % position)
      self.fd.seek(position)

      html = []

      html.append("<html><head>")
      html.append("<meta http-equiv=\"Content-Type\" " \
                  "content=\"text/html; charset=%s\">" \
                  % str(self.getEncoding()))
      html.append("</head><body>")

      found = False
      words = []

      # Translation of the first matching word, shown when there is
      # no exact match
      suggestedTrans = None

      result = meta.SearchResult()

      # DEBUG
      _linesRead = 0

      for line in self.fd:
         _linesRead += 1
         line = line.strip()

         # Lines without the two-space separator are not entries
         if "  " not in line:
            continue

         orig, trans = line.split("  ", 1)

         if line.lower().startswith(word_lowered):

            if not orig.lower().startswith(word_lowered):
               break

            if orig.lower() == word_lowered and not found:
               found = True
               self._appendTranslation(html, orig, trans)

            words.append(orig)
            if len(words) == 1:
               suggestedTrans = trans
         elif words:
            # Matching lines are expected to be adjacent, so the first
            # non-matching line after a match ends the scan
            break

      debugLog(DEBUG, "%d lines scanned" % _linesRead)

      if not found:
         if words:
            self._appendTranslation(html, words[0], suggestedTrans)
         else:
            result.setError(errortype.NOT_FOUND)

      html.append("</body></html>")

      try:
         translation = "".join(html)
      except UnicodeError:
         # Mixing incompatible string types can fail on join
         result.setError(errortype.INVALID_ENCOFING)
         translation = ""

      result.setTranslation(translation)
      result.setWordList(words)

      debugLog(DEBUG, "MovaParser: Search took %f seconds" \
               % (time.time() - _start))

      return result
489
490
491
492 # FIXME: Deprecated
class TMXParser(plaindict.PlainDictionary):
   """Built-in TMX parser.
   Reads TMX files and does the search.

   The whole file is parsed by start() into a mapping of source
   segments to lists of translated segments.
   """

   # Header attributes copied into self.header
   HEADER_KEYS = ("srclang", "creationtool", "creationtoolversion",
                  "o-tmf", "adminlang", "datatype", "segtype")

   def __init__(self, filePath):
      """Initialize parser for the file filePath.

      The file is not read here, see start().
      """

      systemLog(WARNING, "***")
      systemLog(WARNING, "*** WARNING:")
      systemLog(WARNING, "*** TMX implementation is fuzzy and should " \
                "not be used yet!")
      systemLog(WARNING, "***")

      self.filePath = filePath
      self.name = os.path.splitext(os.path.basename(filePath))[0]
      self.needsList = True
      self.encoding = None

      # Source segment -> list of translations
      self.mapping = {}
      self.header = {}

      # Parsing state
      self.trans = []
      self.orig = None
      self.inTu = 0
      self.inTuv = 0
      self.inSeg = 0
      self.lang = ""


   def start(self):
      """Allocate resources: parse the file into the mapping"""

      parser = xml.parsers.expat.ParserCreate()
      # Deliver contiguous character data in as few charData() calls
      # as possible instead of arbitrary pieces
      parser.buffer_text = True
      parser.StartElementHandler = self.startElement
      parser.EndElementHandler = self.endElement
      parser.CharacterDataHandler = self.charData

      if not self.filePath:
         return

      fd = open(self.filePath)
      try:
         parser.Parse(fd.read(), 1)
      finally:
         fd.close()


   def getType(self):
      """Return dictionary type"""

      from lib import dicttype
      return dicttype.TMX


   def setName(self, name):
      """Set new name"""

      self.name = name


   def getName(self):
      """Return file name"""

      return self.name


   def setEncoding(self, encoding):
      """Set encoding"""

      self.encoding = encoding


   def getEncoding(self):
      """Return encoding taken from the application configuration.

      Note that the value stored by setEncoding() is not used here.
      """

      return wxGetApp().config.encoding


   def getUsesWordList(self):
      """Return True if uses word list, False otherwise"""

      return self.needsList


   def startElement(self, name, attrs):
      """Part of SAX parsing method"""

      if name == "tu":
         self.inTu = 1
         # Forget the source segment of the previous unit
         self.orig = None
      elif name == "tuv":
         self.inTuv = 1
         # TMX 1.4 names the attribute 'xml:lang', older files 'lang'
         self.lang = attrs.get("lang") or attrs.get("xml:lang", "")
      elif name == "seg":
         self.inSeg = 1
      elif name == "header":
         for key in self.HEADER_KEYS:
            if key in attrs:
               self.header[key] = attrs[key]


   def endElement(self, name):
      """Part of SAX parsing method"""

      if name == "tu":
         self.inTu = 0
         # A unit without source segment has nothing to map from
         if self.orig is not None:
            self.mapping.setdefault(self.orig, []).extend(self.trans)
         self.orig = None
         self.trans = []
      elif name == "tuv":
         self.inTuv = 0
      elif name == "seg":
         self.inSeg = 0


   def charData(self, data):
      """Part of SAX parsing method.

      Text inside a segment is the source text if the language of the
      current variant equals the header source language, otherwise it
      is a translation.
      """

      if not self.inSeg:
         return

      if self.lang == self.header.get("srclang"):
         self.orig = data
      else:
         self.trans.append(data)


   def search(self, word):
      """Lookup word.

      Returns tuple (html, avail, errno): the HTML document showing
      the first source segment that starts with the word, the list of
      all such segments and 1 as errno if there are none (0 otherwise).
      """

      errno = 0

      html = ["<html><head>" \
              "<meta http-equiv=\"Content-Type\" " \
              "content=\"text/html; charset=%s\">" \
              "</head><body>" % self.getEncoding()]

      avail = []
      word_lowered = word.lower()
      indent = "&nbsp;" * 3

      for key in self.mapping:
         if not key.lower().startswith(word_lowered):
            continue

         avail.append(key)

         # Only the first match is shown
         if len(avail) == 1:
            html.append("<u><b>%s</b></u><br>" % key)
            html.append("<table><tr><td>")
            html.append(indent + ("<br>" + indent).join(self.mapping[key]))
            html.append("</td></tr></table>")

      html.append("</body></html>")

      if not avail:
         errno = 1

      return ("".join(html), avail, errno)


   def makeHashTable(self):
      """Do nothing"""

      pass
648          
649
650
class DictParser(plaindict.PlainDictionary):
   """Built-in dictd dictionaries parser.
   Reads dictd dictionaries and does the search.
   """

   def __init__(self, filePath):
      """Initialize parser for the file filePath.

      The dictionary is not opened here, see start().
      """

      self.filePath = filePath
      self.needsList = True
      # File name with two extensions removed
      self.name = os.path.splitext(
         os.path.splitext(os.path.basename(filePath))[0])[0]
      self.encoding = 'UTF-8'
      self.checksum = None

      self.configChanged = False

      # Dictionary database, opened by start()
      self.dict = None
      # Sorted list of definitions, filled by the first search()
      self.definitions = None


   def start(self):
      """Allocate resources"""

      name = os.path.splitext(os.path.splitext(\
         os.path.basename(self.filePath))[0])[0]
      indexFile = os.path.join(os.path.dirname(self.filePath),
                               name)
      self.dict = dictdlib.DictDB(indexFile)


   def stop(self):
      """Free resources"""

      # Drop the reference but keep the attribute, so that stop() may
      # be called more than once
      self.dict = None


   def getPath(self):
      """Return full file path"""

      return self.filePath


   def getType(self):
      """Return dictionary type"""

      from lib import dicttype
      return dicttype.DICT


   def setName(self, name):
      """Set new name"""

      self.name = name


   def getName(self):
      """Return file name"""

      return self.name


   def setEncoding(self, encoding):
      """Set encoding"""

      self.encoding = encoding


   def getEncoding(self):
      """Return encoding set for that dictionary"""

      return self.encoding


   def setChecksum(self, newSum, first=False):
      """Set checksum. Used after checksum change.

      The configuration is marked as changed only when no checksum
      was set before. The 'first' argument is accepted but not used.
      """

      if self.checksum is None:
         self.configChanged = True

      self.checksum = newSum


   def getUsesWordList(self):
      """Return True if uses word list, False otherwise"""

      return self.needsList


   def _getTranslation(self, word):
      """Return word and translation code without formatting
      full HTML document.

      Returns tuple (orig, translation); both are None if the
      dictionary has no definition of the word. If there are several
      definitions, only the last one is returned.
      """

      translations = self.dict.getdef(word)

      orig = None
      translation = None

      for source in translations:
         chunks = [chunk.strip() for chunk in source.split('\n')]

         # The first line is the word, optionally followed by
         # pronunciation in square brackets
         orig = chunks[0]
         pron = re.findall(r"\[(.*?)\]", orig)
         if pron:
            orig = "<b>%s</b> [<i>%s</i>]" % \
                   (orig.replace(" [%s]" % pron[0], ""), pron[0])
         else:
            orig = "<b>%s</b>" % orig

         items = ["<li>%s</li>" % chunk for chunk in chunks[1:] if chunk]
         translation = "<ul>%s</ul>" % "".join(items)

         # Text in curly brackets becomes a link
         for link in re.findall(r"\{(.*?)\}", translation):
            translation = translation.replace("{%s}" % link,
                                              "<a href=\"%s\">%s</a>" \
                                              % (link, link))

      return (orig, translation)


   def search(self, word):
      """Lookup word.

      Returns meta.SearchResult holding the HTML translation and the
      list of definitions that start with the word. If the word itself
      has no translation, the first definition of that list is tried;
      if that fails too, the NOT_FOUND error is set.
      """

      _start = time.time()

      result = meta.SearchResult()

      word_lowered = word.lower()

      if self.definitions is None:
         self.definitions = self.dict.getdeflist()
         self.definitions.sort()

      words = [definition for definition in self.definitions
               if definition.lower().startswith(word_lowered)]

      html = []

      html.append("<html><head>")
      html.append("<meta http-equiv=\"Content-Type\" " \
                  "content=\"text/html; charset=%s\">" \
                  % str(self.getEncoding()))
      html.append("</head><body>")

      (orig, translation) = self._getTranslation(word)

      if not translation and words:
         debugLog(DEBUG, "Retrying search...")
         orig, translation = self._getTranslation(words[0])

      if not translation:
         result.setError(errortype.NOT_FOUND)

      # Do not print 'None' in the document if nothing was found
      orig = orig or ""
      translation = translation or ""

      html.append("<table width=\"100%\"><tr>")
      html.append("<td bgcolor=\"%s\">" % WORD_BG)
      html.append("<b>%s</b></td></tr>" % orig)
      html.append("<tr><td>")
      html.append("<p>%s</p>" % translation)
      html.append("</td></tr></table>")
      html.append("</body></html>")

      result.setTranslation("".join(html))
      result.setWordList(words)

      debugLog(DEBUG, "DictParser: Search took % f seconds" \
            % (time.time() - _start))

      return result
833
834
835 # TODO:
836 # 1. This is not a parser, move to another module
837 # 2. Add needed methods
838 #
class DictConnection(meta.Dictionary):
   """Built-in DICT client
   Connects to a DICT server and does the search.
   """

   # Pronunciation notations tried in this order on the first line of
   # a definition: (regular expression, text to remove from the line)
   PRON_STYLES = ((r"\[(.*?)\]", " [%s]"),
                  (r"/(.*?)/", " /%s/"),
                  (r"\\(.*?)\\", " \\%s\\"))

   def __init__(self, server, port, db, strategy):
      """Remember connection parameters. No connection is made here"""

      self.server = server
      self.port = port
      self.db = db
      self.strategy = strategy
      self.encoding = "UTF-8"
      self.needsList = False
      self.name = 'Connection to DICT server'


   def getUsesWordList(self):
      """Return True if uses word list, False otherwise"""

      return self.needsList


   def setName(self, name):
      """Set new name"""

      self.name = name


   def getName(self):
      """Return name"""

      return self.name


   def setEncoding(self, encoding):
      """Set encoding"""

      self.encoding = encoding


   def getEncoding(self):
      """Return encoding"""

      return self.encoding


   def _formatHeading(self, orig):
      """Return HTML code of the first line of a definition.

      Pronunciation, if found in one of the known notations, is
      moved to the end in square brackets and italics.
      """

      for pattern, template in self.PRON_STYLES:
         pron = re.findall(pattern, orig)
         if pron:
            return "<b>%s</b> [<i>%s</i>]" % \
                   (orig.replace(template % pron[0], ""), pron[0])

      return "<b>%s</b>" % orig


   def search(self, word):
      """Lookup word.

      Returns meta.SearchResult. The CONNECTION_ERROR error is set if
      the connection fails and NOT_FOUND if the server returns no
      definitions (a failed request counts as no definitions).
      """

      result = meta.SearchResult()

      try:
         conn = dictclient.Connection(self.server, self.port)
      except Exception:
         result.setError(errortype.CONNECTION_ERROR)
         return result

      html = []
      html.append("<html><head>" \
                  "<meta http-equiv=\"Content-Type\" " \
                  "content=\"text/html; charset=%s\">" \
                  "</head><body>" % self.getEncoding())

      found = False

      try:
         data = conn.define(self.db, word)
      except Exception:
         data = []

      for d in data:
         found = True

         html.append("<p><table width=\"100%\"><tr>")
         html.append("<td bgcolor=\"%s\">" % DICT_BG)
         html.append("<b><i>%s</i></b></td></tr>" % d.getdb().getdescription())

         # Definition is plain text, so escape HTML special characters.
         # '&' goes first, otherwise the entities added for '<' and '>'
         # would be escaped again
         source = d.getdefstr()
         source = source.replace('&', '&amp;')
         source = source.replace('<', '&lt;')
         source = source.replace('>', '&gt;')

         orig = self._formatHeading(source.split("\n", 1)[0])

         html.append("<tr><td bgcolor=\"%s\">" % WORD_BG)
         html.append("%s</td></tr>" % orig)

         # Empty lines separate paragraphs, other line breaks are
         # replaced by spaces
         translation = source.replace('\n\n', '<br><br>')
         translation = translation.replace('\n', ' ')

         # Text in curly brackets becomes a link
         for link in re.findall(r"\{(.*?)\}", translation):
            translation = translation.replace("{%s}" % link,
                              "<a href=\"%s\">%s</a>" % (link, link))

         html.append("<tr><td>%s</td></tr>" % translation)
         html.append("</table></p>")

      html.append("</body></html>")

      result.setTranslation(''.join(html))

      if not found:
         result.setError(errortype.NOT_FOUND)

      return result
Note: See TracBrowser for help on using the browser.