# Source listing from the babiloo package.
#
# File: sdictem.py

#!/usr/bin/python

#    Copyright (C) 2008-2010 Ivan Garcia <contact@ivangarcia.org>
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License along
#    with this program; if not, write to the Free Software Foundation, Inc.,
#    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

import os
import sys
import zlib
import struct
import time

from dictionary import DictionaryType, DictFormatError,  Definition

# Diagnostic hooks.  The two commented-out variants write the message to
# stdout/stderr; the active definitions accept one argument and discard it.
#prinfo  = lambda x: sys.stdout.write(x+"\n")
#prerror = lambda x: sys.stderr.write(x+"\n")
prinfo = lambda x: 0
prerror = lambda x: 0

# Compression method names (stored on SDictionary.compressor), the expected
# 4-byte file signature, and the number of header bytes read from the file.
# NOTE(review): COMPRESSOR_BZIP2 and SDICT_SHORT_NDX_LEN are not referenced by
# the code in this file; SDICT_SHORT_NDX_LEN is presumably the longest prefix
# kept in the short index -- confirm before relying on it.
COMPRESSOR_NONE          = 'none'
COMPRESSOR_GZIP          = 'gzip'
COMPRESSOR_BZIP2         = 'bzip2'
SDICT_SIG                = 'sdct'
SDICT_HEADER_SIZE        = 43
# Limit for forward scans of the full index.
SDICT_SEARCH_FORWARD     = 15000
SDICT_SHORT_NDX_LEN      = 3

# Byte offsets of the fields inside the fixed-size header.  The two language
# codes are read as 3 bytes each, the compression marker as 1 byte, and the
# eight values from WORDS_TOT_PTR_POS onwards as consecutive unsigned 32-bit
# integers (0x27 + 4 == SDICT_HEADER_SIZE).
W_LANG_POS         = 4
A_LANG_POS         = 7
COMPRESSOR_POS     = 0x0a
WORDS_TOT_PTR_POS  = 0x0b
SINDEX_TOT_PTR_POS = 0x0f
TITLE_PTR_POS      = 0x13
COPYRIGHT_PTR_POS  = 0x17
VERSION_PTR_POS    = 0x1b
SINDEX_PTR_POS     = 0x1f
FINDEX_PTR_POS     = 0x23
ARTICLES_PTR_POS   = 0x27

# Signature written at the start of the "pydict" file produced when this
# module is run as a script.
PyDICT_SIG  = 'pydict'

class SDictionary(DictionaryType):
    """Dictionary backend that reads SDict (``sdct`` signature) files.

    ``load()`` parses the fixed-size header, then walks the full index once
    to build two in-memory caches:

    * ``wordList_cache`` -- dict mapping each decoded word to the absolute
      file offset of its article;
    * ``index_cache``    -- list of decoded words in file order.

    Article bodies are read from the open file on demand.
    """

    def __init__(self, file_name, encoding = "utf-8"):
        """Set up instance state and delegate to ``DictionaryType.__init__``.

        ``encoding`` is accepted for signature compatibility but is not used
        by this class; words and articles are always decoded as UTF-8.
        NOTE(review): what the base initialiser does with ``file_name`` is
        not visible from this file.
        """
        self.init = False
        self.curr_file = None
        # Defined up front so unload_dictionary()/close() are safe even when
        # no file was ever opened.
        self.infile_handler = None
        DictionaryType.__init__(self, file_name)

    def __len__(self):
        """Return the word count recorded in the file header."""
        return self.header['words_total']

    def __getitem__(self, key):
        """Return ``(word, Definition)`` for a word or an integer position.

        An ``int`` key is first translated to the word at that position of
        ``index_cache``.  Raises ``KeyError``/``IndexError`` for unknown keys.
        """
        if isinstance(key, int):
            key = self.index_cache[key]
        pos = self.wordList_cache[key]
        return (key, Definition(self.read_pos(pos)))

    def get_words(self, patt):
        """Return the list of words starting with ``patt`` (possibly empty)."""
        return (self.search_word(patt, True) or [])

    def search_word(self, word, listonly=False):
        """Look a word up in the cached index.

        With ``listonly`` true, return the list (in file order, possibly
        empty) of indexed words that start with ``word``.  Otherwise return
        the article text of the entry equal to ``word``, or ``None`` when
        there is no such entry.

        The previous implementation walked ``sindex_1``..``sindex_3``, which
        this class never populates, and compared the ``(word, offset)`` tuple
        returned by ``get_next_word()`` against a string, so it could not
        produce a match.  The caches built by ``load()`` are used instead.
        """
        prinfo("Searching for %s"%repr(word))
        # Cache keys are decoded text; accept a UTF-8 byte string as well.
        if isinstance(word, bytes):
            word = word.decode('utf8')

        if listonly:
            return [w for w in self.get_index() if w.startswith(word)]

        pos = self.wordList().get(word)
        if pos is None:
            prinfo("Not found")
            return None
        return self.read_unit(pos)

    def load_dictionary_fast(self, file_name):
        """Open ``file_name`` and parse its header; return True on success.

        The short index is not read.  On success the index cursor is placed
        on the first full-index entry.
        """
        prinfo("Try to load '%s'"%file_name)
        self.infile = file_name
        prinfo('Reading header')
        if not self.read_header():
            return False
        self.f_index_pos_cur = self.f_index_pos
        return True

    def get_next_word(self):
        """Read the full-index entry at the cursor and advance the cursor.

        Each entry starts with an 8-byte record (two unsigned 16-bit values
        and one unsigned 32-bit article pointer) followed by the word bytes.
        The first 16-bit value is the total entry length; zero marks the end
        of the index.

        Returns ``(raw_word_bytes, absolute_article_offset)`` or ``None`` at
        the end of the index.  Raises ``DictFormatError`` when the entry is
        truncated or declares an impossible length (this used to terminate
        the whole process via ``sys.exit``).
        """
        handle = self.infile_handler
        handle.seek(self.f_index_pos_cur)
        hdr = handle.read(8)
        if len(hdr) < 8:
            prerror('File format error')
            raise DictFormatError("Truncated SDict index entry")
        (entry_len, _prev_len, aptr) = struct.unpack('=HHL', hdr)
        if not entry_len:
            prinfo('Last word reached')
            return None
        wlen = entry_len - 8
        if wlen < 0:
            prerror('File format error')
            raise DictFormatError("Invalid SDict index entry length")
        word = handle.read(wlen)
        self.cur_word = word
        self.cur_word_pos = aptr
        self.f_index_pos_cur += wlen + 8
        return (word, aptr+self.articles_pos)

    def unload_dictionary(self):
        """Close the underlying file (if any) and reset the loaded state."""
        prinfo('Unloading dictionary')
        handle = getattr(self, 'infile_handler', None)
        if handle is not None:
            handle.close()
        self.infile_handler = None
        self.words_list   = None
        self.words_hash   = None
        self.sindex_hash  = None
        self.header       = None
        self.sindex_1     = None
        self.sindex_2     = None
        self.sindex_3     = None
        self.infile       = None
        self.init         = False
        return True

    def read_header(self):
        """Open ``self.infile`` and parse the SDict header.

        Returns True when the header was read and the instance state
        (``header``, ``compressor``, ``f_index_pos``, ``articles_pos``,
        ``init``) was filled in; returns False otherwise.  On every failure
        path the file is closed again so no handle is leaked.
        """
        infile = self.infile
        try:
            handle = open(infile, 'rb')
        except Exception:
            return False

        # read_unit() reads through self.infile_handler, so publish the
        # handle before parsing.
        self.infile_handler = handle
        loaded = False
        try:
            loaded = self._parse_header(handle.read(SDICT_HEADER_SIZE))
        except (struct.error, zlib.error, ValueError, IOError, DictFormatError):
            # Corrupt or truncated header data: report "not loadable"
            # instead of leaking a low-level exception.
            prerror("Unable to read header of file '%s'"%infile)
            loaded = False
        finally:
            if not loaded:
                handle.close()
                self.infile_handler = None
        return loaded

    def _parse_header(self, hdr):
        """Decode the raw header bytes; return True on success."""
        if hdr[:4] != SDICT_SIG:
            prerror("Wrong signature file '%s'"%self.infile)
            return False
        if len(hdr) < SDICT_HEADER_SIZE:
            prerror("Truncated header in file '%s'"%self.infile)
            return False

        w_lang = hdr[W_LANG_POS:W_LANG_POS+3].strip("\0")
        a_lang = hdr[A_LANG_POS:A_LANG_POS+3].strip("\0")

        # NOTE(review): the marker byte is parsed as an ASCII digit; confirm
        # against the SDict format description whether the method is really
        # encoded that way for every file.  A non-digit raises ValueError,
        # which read_header() turns into a False return.
        compr = int(hdr[COMPRESSOR_POS:COMPRESSOR_POS+1])
        if compr == 0:
            compr_method = COMPRESSOR_NONE
        elif compr == 1:
            compr_method = COMPRESSOR_GZIP
        else:
            prerror("Wrong compression type '%s'"%compr)
            return False

        self.compressor = compr_method

        (tot_words,
         sindex_total,
         title_ptr,
         copyr_ptr,
         version_ptr,
         sindex_pos,
         f_index_ptr,
         articles_ptr) = struct.unpack('=LLLLLLLL', hdr[WORDS_TOT_PTR_POS:])

        title = self.read_unit(title_ptr)
        if title is None:
            prerror('Unable to read title')
            return False

        copyright = self.read_unit(copyr_ptr)
        if copyright is None:
            prerror('Unable to read copyright')
            return False

        version = self.read_unit(version_ptr)
        if version is None:
            prerror('Unable to read version')
            return False

        prinfo('Dictionary information:')
        prinfo("   Title: '%s'"%title)
        prinfo("   Copyright: '%s'"%copyright)
        prinfo("   Version: '%s'"%version)
        prinfo("   Langs: %s/%s"%(w_lang,a_lang))
        prinfo("   Words: %s"%tot_words)
        prinfo("   Short index: %s"%sindex_total)
        prinfo("   Compression: %s"%compr_method)
        prinfo(' ')
        prinfo("   Short index offset: 0x%x"%sindex_pos)
        prinfo("   Full index offset : 0x%x"%f_index_ptr)
        prinfo("   Articles offset   : 0x%x"%articles_ptr)
        prinfo(' ')

        self.init = True
        self.f_index_pos = f_index_ptr
        self.articles_pos = articles_ptr
        self.header = {
            'title':        title,
            'copyright':    copyright,
            'version':      version,
            'w_lang':       w_lang,
            'a_lang':       a_lang,
            'words_total':  tot_words,
            'sindex_total': sindex_total,
            'sindex_ptr':   sindex_pos,
            'f_index_pos':  f_index_ptr,
            'articles_pos': articles_ptr,
        }
        return True

    def read_unit(self, fpos, raw=False):
        """Read the length-prefixed unit stored at absolute offset ``fpos``.

        A unit is an unsigned 32-bit length followed by that many bytes.
        With ``raw`` true the stored bytes are returned unchanged; otherwise
        they are decompressed according to ``self.compressor`` and decoded
        as UTF-8.  Returns ``None`` for an unknown compressor and raises
        ``DictFormatError`` when the length prefix itself is truncated.
        """
        handle = self.infile_handler
        handle.seek(fpos)
        size_field = handle.read(4)
        if len(size_field) < 4:
            raise DictFormatError("Truncated SDict unit at offset %d"%fpos)
        art = handle.read(struct.unpack('=L', size_field)[0])
        if raw:
            return art
        if self.compressor == COMPRESSOR_NONE:
            return art.decode('utf-8')
        if self.compressor == COMPRESSOR_GZIP:
            return zlib.decompress(art).decode('utf-8')
        prerror('Wrong compression type')
        return None

    def load(self, file_name):
        """Load ``file_name``, replacing any dictionary already loaded.

        Raises ``DictFormatError`` when the file cannot be opened or is not
        a valid SDict dictionary.
        """
        self.curr_file = file_name
        self.wordList_cache = {}
        self.index_cache = []
        if self.init:
            self.unload_dictionary()
        if not self.load_dictionary_fast(file_name):
            raise DictFormatError("Not a valid SDict dictionary")
        # One pass over the full index fills both caches.
        self._scan_index()
        self._DictionaryType__keys = self.get_index()

    def reload(self):
        """Load the most recently loaded file again."""
        return self.load(self.curr_file)

    def rewind(self):
        """Move the index cursor back to the first full-index entry."""
        self.f_index_pos_cur = self.f_index_pos

    def _scan_index(self):
        """Walk the whole full index once and rebuild both caches.

        The caches are only replaced after the scan completed, so a format
        error halfway through does not leave partial data behind.
        """
        self.rewind()
        positions = {}
        ordered = []
        while True:
            entry = self.get_next_word()
            if not entry:
                break
            raw_word, pos = entry
            text = raw_word.decode('utf8')
            positions[text] = pos   # a repeated word keeps its last offset
            ordered.append(text)
        self.wordList_cache = positions
        self.index_cache = ordered

    def wordList(self):
        """Return the word -> article offset mapping, building it if empty."""
        if not self.wordList_cache:
            self._scan_index()
        return self.wordList_cache

    def get_index(self):
        """Return the list of words in file order, building it if empty."""
        if not self.index_cache:
            self._scan_index()
        return self.index_cache

    def get_word_pos(self, word):
        """Return the absolute article offset for ``word``.

        With a populated cache this is a plain dict lookup and raises
        ``KeyError`` for an unknown word.  With an empty cache the full index
        is scanned from its beginning and ``None`` is returned when the word
        is absent.
        """
        if self.wordList_cache:
            return self.wordList_cache[word]

        # Index entries are raw UTF-8 bytes, so compare in that domain.
        target = word if isinstance(word, bytes) else word.encode('utf8')
        self.rewind()
        while True:
            entry = self.get_next_word()
            if not entry:
                break
            raw_word, pos = entry
            if raw_word == target:
                return pos
        return None

    def lookup(self, word):
        """Return the article text for ``word``.

        Returns ``None`` when ``get_word_pos()`` reports no position.
        """
        pos = self.get_word_pos(word)
        if pos is None:
            return None
        return self.read_pos(pos)

    def read_pos(self, pos):
        """Return the decoded article stored at absolute offset ``pos``."""
        return self.read_unit(pos)

    def close( self ):
        """Release the file handle and cached header state."""
        return self.unload_dictionary()

    def getTypeStr(self):
        """Return the identifier of this dictionary backend."""
        return "SDictEm"

    def getName(self):
        """Return the dictionary title read from the header."""
        return self.header['title']

    def getCopyright(self):
        """Return the copyright text read from the header."""
        return self.header['copyright']

    def getVersion(self):
        """Return the version text read from the header."""
        return self.header['version']

def t1(func):
    """Call ``func()``, print the elapsed wall-clock time, return its result.

    The message is built with ``%`` formatting and printed through a single
    parenthesised argument, which yields the same ``timing: <seconds>`` line
    under both the Python 2 print statement and the Python 3 print function.
    """
    started = time.time()
    result = func()
    print('timing: %s' % (time.time() - started,))
    return result

def t2(func, args):
    """Call ``func(args)``, print the elapsed wall-clock time, return its result.

    ``args`` is passed through as one positional argument.  The message is
    printed through a single parenthesised argument so the line reads
    ``timing: <seconds>`` on both Python 2 and Python 3.
    """
    started = time.time()
    result = func(args)
    print('timing: %s' % (time.time() - started,))
    return result

def _convert_to_pydict(sd, src_path):
    """Write the entries of the loaded dictionary ``sd`` to a "pydict" file.

    The output is created in the current directory and named after
    ``src_path`` with its extension replaced by ``.pd``.  Layout: signature
    and three unsigned 32-bit header values, then the record table, the word
    bytes and the raw article bytes.
    """
    import shutil
    import tempfile

    print("Get...")
    # load() leaves the index cursor at the end of the index, so start over.
    sd.rewind()
    seen_offsets = set()
    entries = []
    while True:
        entry = sd.get_next_word()
        if entry is None:
            break
        # Keep only the first word that points at a given article.
        if entry[1] in seen_offsets:
            continue
        seen_offsets.add(entry[1])
        entries.append(entry)

    print("Sort...")
    entries.sort(key=lambda item: item[0].decode('utf-8').lower())

    print("Compile...")
    # Anonymous temporary files disappear on close, even after an error.
    b = tempfile.TemporaryFile()
    w = tempfile.TemporaryFile()
    a = tempfile.TemporaryFile()
    try:
        wrong = 0
        for (word, apos) in entries:
            art = sd.read_unit(apos, True)
            if len(art) > 0xffff:
                # Article length does not fit the 16-bit field; skip it.
                wrong += 1
                continue
            if len(word) > 255:
                chars = word.decode('utf-8')[:128]
                word = chars.encode('utf-8')
                # 128 characters can still need more than 255 bytes.
                while len(word) > 255:
                    chars = chars[:-1]
                    word = chars.encode('utf-8')
            # NOTE(review): this record uses native size/alignment while the
            # file header below uses '='; confirm against the pydict reader
            # before changing either format string.
            bitem = struct.pack('LLHB', w.tell(), a.tell(), len(art), len(word))
            b.write(bitem)
            w.write(word)
            a.write(art)

        print("Create pydict file...")
        dict_file_name = os.path.splitext(os.path.basename(src_path))[0] + '.pd'
        hdr_len = len(PyDICT_SIG) + struct.calcsize('=LLL')
        hdr = PyDICT_SIG + struct.pack('=LLL', len(entries)-wrong,
                                       b.tell()+hdr_len,
                                       b.tell()+w.tell()+hdr_len)

        d = open(dict_file_name, 'wb')
        try:
            d.write(hdr)
            for part in (b, w, a):
                part.seek(0)
                shutil.copyfileobj(part, d)
        finally:
            d.close()
    finally:
        b.close()
        w.close()
        a.close()


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage %s <filename>.dct" % sys.argv[0])
        sys.exit(1)

    try:
        sd = SDictionary(sys.argv[1])
        # Whether the base initialiser already loaded the file is not
        # visible from here; load explicitly when it did not.
        if not sd.init:
            sd.load(sys.argv[1])
    except Exception as err:
        sys.stderr.write("%s\n" % (err,))
        print("Usage %s <filename>.dct" % sys.argv[0])
        sys.exit(1)

    try:
        _convert_to_pydict(sd, sys.argv[1])
    finally:
        sd.close()

    print("Done")
    sys.exit(0)

# Listing generated by Doxygen 1.6.0.